PyOpenCL - Different results on Intel and NVidia

Question

I'm trying to run a simulation that is based on gaussian pulses propagation. I'm cross developing between my windows desktop with an i5 4590 & GTX 970 (latest drivers) and my early 2015 macbook air.

When running my main code i couldn't get any decent results on my desktop (the calculations diverged) but on my mac the results seemed ok.

To investigate the problem further I've tried to run a simples gaussian propagation. The results on the macbook are more or less ok, while on the desktop it's a complete mess.

I'm running the same code on both machines and both have the same distribution of python (2.7.10) and the respective modules.

Here is my code

import scipy as sp
import pyopencl as cl
import matplotlib.pyplot as plot

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
MF = cl.mem_flags

dx = 0.01
X = sp.arange(0.0, 100, dx)
N = len(X)

A_h = (sp.exp(-(X-50)**2/10.)*sp.exp(-1j*1000*X)).astype(sp.complex64)
A_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=A_h)

plot.figure()
plot.plot(X, abs(A_h) ** 2 / (abs(A_h) ** 2).max())

Source = """
    #define complex_ctr(x, y) (float2)(x, y)
    #define complex_mul(a, b) complex_ctr(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y))
    #define complex_unit (float2)(0, 1)

    __kernel void propagate(__global float2 *A){
        const int gid_x = get_global_id(0);
        float EPS = 0.1f;
        A[gid_x] = A[gid_x] + EPS*complex_mul((A[gid_x+1] + A[gid_x-1]), complex_unit);
    }
"""
prg = cl.Program(ctx, Source).build()
for i in range(3000):
    print i
    event = prg.propagate(queue, (N,), None, A_d)
    event.wait()
cl.enqueue_copy(queue, A_h, A_d)

plot.plot(X, abs(A_h) ** 2 / (abs(A_h) ** 2).max())

plot.show()

And here are the results

Desktop result:

Mac result:

Green line corresponds to the gaussian after propagation and the Blue line is the initial gaussian

What may cause this issue on the NVidia side? I think i'm missing a crucial step to prevent this from happening and that it runs on the mac somewhat due to luck

EDIT

This is my final code that is working based on the user's suggestions

import scipy as sp
import pyopencl as cl
import matplotlib.pyplot as plot

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
MF = cl.mem_flags

dx = sp.float32(0.001)
X = sp.arange(0.0, 100, dx).astype(sp.float32)
N = len(X)

A_h = (sp.exp(-(X-50)**2/10.)*sp.exp(1j*1000*X)).astype(sp.complex64)
A_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=A_h)
B_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=A_h)

plot.figure()
plot.plot(X, abs(A_h) ** 2 / (abs(A_h) ** 2).max())

Source = """
    #define complex_ctr(x, y) (float2)(x, y)
    #define complex_mul(a, b) complex_ctr((a).x * (b).x - (a).y * (b).y, (a).x * (b).y + (a).y * (b).x)
    #define complex_unit (float2)(0, 1)

    __kernel void propagate(__global float2 *A){
        const int gid_x = get_global_id(0);
        float EPS = 0.1f;
        float2 a1, a2;
        a1 = A[gid_x-1];
        a2 = A[gid_x+1];
        barrier(CLK_GLOBAL_MEM_FENCE);
        A[gid_x] += EPS*complex_mul((a1 + a2), complex_unit);
    }
"""

prg = cl.Program(ctx, Source).build()
for i in range(12000):
    print i
    evolve = prg.propagate(queue, (N,), None, A_d)
    evolve.wait()
cl.enqueue_copy(queue, A_h, A_d)

plot.plot(X, abs(A_h) ** 2)

plot.show()

DarkZeros · Accepted Answer

EDIT: oh, just read @talonmies comment, it is same solution as this.

This code is not safe in OpenCL, it has datarace problem:

A[gid_x] = A[gid_x] + EPS*complex_mul((A[gid_x+1] + A[gid_x-1]), complex_unit);

Every work item x uses x+1 and x-1. Depending on the schedule of work items the result will be different.

Use 2 buffers instead, read from A, write to B, easy:

import scipy as sp
import pyopencl as cl
import matplotlib.pyplot as plot

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
MF = cl.mem_flags

dx = 0.01
X = sp.arange(0.0, 100, dx)
N = len(X)

A_h = (sp.exp(-(X-50)**2/10.)*sp.exp(-1j*1000*X)).astype(sp.complex64)
A_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=A_h)
B_d = cl.Buffer(ctx, MF.READ_WRITE)

plot.figure()
plot.plot(X, abs(A_h) ** 2 / (abs(A_h) ** 2).max())

Source = """
    #define complex_ctr(x, y) (float2)(x, y)
    #define complex_mul(a, b) complex_ctr(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y))
    #define complex_unit (float2)(0, 1)

    __kernel void propagate(__global float2 *A, __global float2 *B){
        const int gid_x = get_global_id(0);
        float EPS = 0.1f;
        B[gid_x] = A[gid_x] + EPS*complex_mul((A[gid_x+1] + A[gid_x-1]), complex_unit);
    }
"""
prg = cl.Program(ctx, Source).build()
for i in range(3000):
    print i
    event = prg.propagate(queue, (N,), None, A_d, B_d)
    A_d, B_d = B_d, A_d #Swap buffers, so A always has results
    #event.wait() #You don't need this, this is slowing terribly the execution, enqueue_copy already waits
cl.enqueue_copy(queue, A_h, A_d)

plot.plot(X, abs(A_h) ** 2 / (abs(A_h) ** 2).max())

plot.show()

PyOpenCL - Different results on Intel and NVidia

Answers (1)

Related Questions