Reputation: 407
I'm trying to run a simulation that is based on gaussian pulses propagation. I'm cross developing between my windows desktop with an i5 4590 & GTX 970 (latest drivers) and my early 2015 macbook air.
When running my main code i couldn't get any decent results on my desktop (the calculations diverged) but on my mac the results seemed ok.
To investigate the problem further I've tried to run a simples gaussian propagation. The results on the macbook are more or less ok, while on the desktop it's a complete mess.
I'm running the same code on both machines and both have the same distribution of python (2.7.10) and the respective modules.
Here is my code
import scipy as sp
import pyopencl as cl
import matplotlib.pyplot as plot
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
MF = cl.mem_flags
dx = 0.01
X = sp.arange(0.0, 100, dx)
N = len(X)
A_h = (sp.exp(-(X-50)**2/10.)*sp.exp(-1j*1000*X)).astype(sp.complex64)
A_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=A_h)
plot.figure()
plot.plot(X, abs(A_h) ** 2 / (abs(A_h) ** 2).max())
Source = """
#define complex_ctr(x, y) (float2)(x, y)
#define complex_mul(a, b) complex_ctr(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y))
#define complex_unit (float2)(0, 1)
__kernel void propagate(__global float2 *A){
const int gid_x = get_global_id(0);
float EPS = 0.1f;
A[gid_x] = A[gid_x] + EPS*complex_mul((A[gid_x+1] + A[gid_x-1]), complex_unit);
}
"""
prg = cl.Program(ctx, Source).build()
for i in range(3000):
print i
event = prg.propagate(queue, (N,), None, A_d)
event.wait()
cl.enqueue_copy(queue, A_h, A_d)
plot.plot(X, abs(A_h) ** 2 / (abs(A_h) ** 2).max())
plot.show()
And here are the results
Mac result:
Green line corresponds to the gaussian after propagation and the Blue line is the initial gaussian
What may cause this issue on the NVidia side? I think i'm missing a crucial step to prevent this from happening and that it runs on the mac somewhat due to luck
EDIT
This is my final code that is working based on the user's suggestions
import scipy as sp
import pyopencl as cl
import matplotlib.pyplot as plot
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
MF = cl.mem_flags
dx = sp.float32(0.001)
X = sp.arange(0.0, 100, dx).astype(sp.float32)
N = len(X)
A_h = (sp.exp(-(X-50)**2/10.)*sp.exp(1j*1000*X)).astype(sp.complex64)
A_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=A_h)
B_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=A_h)
plot.figure()
plot.plot(X, abs(A_h) ** 2 / (abs(A_h) ** 2).max())
Source = """
#define complex_ctr(x, y) (float2)(x, y)
#define complex_mul(a, b) complex_ctr((a).x * (b).x - (a).y * (b).y, (a).x * (b).y + (a).y * (b).x)
#define complex_unit (float2)(0, 1)
__kernel void propagate(__global float2 *A){
const int gid_x = get_global_id(0);
float EPS = 0.1f;
float2 a1, a2;
a1 = A[gid_x-1];
a2 = A[gid_x+1];
barrier(CLK_GLOBAL_MEM_FENCE);
A[gid_x] += EPS*complex_mul((a1 + a2), complex_unit);
}
"""
prg = cl.Program(ctx, Source).build()
for i in range(12000):
print i
evolve = prg.propagate(queue, (N,), None, A_d)
evolve.wait()
cl.enqueue_copy(queue, A_h, A_d)
plot.plot(X, abs(A_h) ** 2)
plot.show()
Upvotes: 0
Views: 196
Reputation: 8410
EDIT: oh, just read @talonmies comment, it is same solution as this.
This code is not safe in OpenCL, it has datarace problem:
A[gid_x] = A[gid_x] + EPS*complex_mul((A[gid_x+1] + A[gid_x-1]), complex_unit);
Every work item x
uses x+1
and x-1
. Depending on the schedule of work items the result will be different.
Use 2 buffers instead, read from A, write to B, easy:
import scipy as sp
import pyopencl as cl
import matplotlib.pyplot as plot
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
MF = cl.mem_flags
dx = 0.01
X = sp.arange(0.0, 100, dx)
N = len(X)
A_h = (sp.exp(-(X-50)**2/10.)*sp.exp(-1j*1000*X)).astype(sp.complex64)
A_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=A_h)
B_d = cl.Buffer(ctx, MF.READ_WRITE)
plot.figure()
plot.plot(X, abs(A_h) ** 2 / (abs(A_h) ** 2).max())
Source = """
#define complex_ctr(x, y) (float2)(x, y)
#define complex_mul(a, b) complex_ctr(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y))
#define complex_unit (float2)(0, 1)
__kernel void propagate(__global float2 *A, __global float2 *B){
const int gid_x = get_global_id(0);
float EPS = 0.1f;
B[gid_x] = A[gid_x] + EPS*complex_mul((A[gid_x+1] + A[gid_x-1]), complex_unit);
}
"""
prg = cl.Program(ctx, Source).build()
for i in range(3000):
print i
event = prg.propagate(queue, (N,), None, A_d, B_d)
A_d, B_d = B_d, A_d #Swap buffers, so A always has results
#event.wait() #You don't need this, this is slowing terribly the execution, enqueue_copy already waits
cl.enqueue_copy(queue, A_h, A_d)
plot.plot(X, abs(A_h) ** 2 / (abs(A_h) ** 2).max())
plot.show()
Upvotes: 3