Reputation: 319
PROBLEM: I see that the output varies depending on the execution. It suggests a memory access problem that I did not manage to find.
ATTEMPT: I am trying to make linear operations in a 8x8 matrix with pyopencl. The example that I am showing operates the rows of the matrix, using the first row so the first column becomes 0 (except first row). The last loop executes this function "repetitions" times and compares it to the first execution, and counts the similar values. I never get as many similar value counts as repetitions where made, and the mismatch is located always in certain area of the matrix.
You can see in the cl code that I got desperate already trying to fix the problem with memory barriers. There must be something out of my understanding going on. Python code:
from __future__ import print_function
from __future__ import absolute_import
import pyopencl as cl
import numpy as np
import os
from numpy.random import RandomState
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
for platform in cl.get_platforms():
print("Platform name:", platform.name)
platform = cl.get_platforms()
platform = [x for x in platform if "AMD" in x.name][0]
device = platform.get_devices()
# ctx = cl.create_some_context()
ctx = cl.Context(device)
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags
n = 2**3
block_size = 2**2
bs = block_size
prng = RandomState(666666)
a=prng.uniform(0,100000,(n,n)) + 2
b = -a + 1
a = a.astype(np.float32)
b = b.astype(np.float32)
kernelpath = "./Stack_Overflow_pyopencl_question1.cl"
with open(kernelpath, "r") as f:
kernel_txt = "".join(f.readlines())
def col1_0(a, b, device):
a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
dest_buf = cl.Buffer(ctx, mf.READ_WRITE, a.nbytes )
prg = cl.Program(ctx, kernel_txt.replace("FLAG_gsize", str(n)).replace("FLAG_lsize", str(bs))).build()
prg.col1_0(queue, a.shape, (bs,bs),np.int32(len(a)),
a_buf, b_buf, dest_buf)
final = np.empty_like(a)
cl.enqueue_copy(queue, final , dest_buf).wait()
return final
res = col1_0(a, b, device)
print("RES\n", res)
base = res
comp = np.zeros(base.shape)
repetitions = 1000
for i in range(repetitions):
new = col1_0(a, b, device)
temp = np.isclose(new,base,atol=2.0e-03)
comp += temp
print(comp)
Opencl code:
__kernel void col1_0(const unsigned int size,
__global float * a, __global float * b, __global float * res) {
__local float la[FLAG_lsize*FLAG_lsize];
__local float lb[FLAG_lsize*FLAG_lsize];
__local float lc[FLAG_lsize*FLAG_lsize];
__local float rowp[FLAG_lsize];
__local float colp[FLAG_lsize];
__local float pivot;
uint i = get_global_id(0);
uint li = get_local_id(0);
uint gi = get_group_id(0);
uint j = get_global_id(1);
uint lj = get_local_id(1);
uint gj = get_group_id(1);
uint size_i = get_global_size(0);
uint size_j = get_global_size(1);
uint lsize_i = get_local_size(0);
uint lsize_j = get_local_size(1);
uint row = j*size_i;
uint lrow = lj*lsize_i;
uint gind = i + row;
uint lind = li + lrow;
uint rp;
uint k;
barrier(CLK_GLOBAL_MEM_FENCE|CLK_LOCAL_MEM_FENCE);
res[gind] = a[gind];
for (k=0;k<1;k++) {
rp = k*size_i;
barrier(CLK_GLOBAL_MEM_FENCE|CLK_LOCAL_MEM_FENCE);
pivot = res[k + rp];
rowp[li] = res[i + rp];
colp[lj] = res[k + row];
la[lind] = res[gind];
barrier(CLK_GLOBAL_MEM_FENCE|CLK_LOCAL_MEM_FENCE);
if (j>k){
lc[lind] = la[lind] - rowp[li]/pivot*colp[lj];
}else{
lc[lind] = la[lind];
}
barrier(CLK_GLOBAL_MEM_FENCE|CLK_LOCAL_MEM_FENCE);
res[gind] = lc[lind];
barrier(CLK_GLOBAL_MEM_FENCE|CLK_LOCAL_MEM_FENCE);
}
}
Oputput example:
[[ 72986.04 28263.281 23286.807 21662.82 38600.445 56755.12
13160.146 77571.305 ]
[ 0. 21457.246 25730.016 83697.62 33790.82 67593.13
6248.0215 -13557.52 ]
[ 0. 23594.188 -10326.518 66544.16 -24266.705 -5718.115
76904.875 23694.09 ]
[ 0. 23392.277 82200.61 74443.24 83087.48 63177.59
50563.84 31685.52 ]
[ 0. 2005.1416 4741.216 6905.4834 -15929.7 -28064.785
10786.973 -12347.803 ]
[ 0. 62289.426 72695.19 -9519.179 42706.625 -6567.9316
62263.58 55469.785 ]
[ 0. 40805.617 26905.514 77325.45 -8362.551 4206.672
78279.016 28778.395 ]
[ 0. 31332.406 78166.016 36025.945 -18576.621 372.4453
-1757.8496 -43466.66 ]]
Example of the comparison of 1000 outputs:
[[1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.]
[1000. 1000. 1000. 1000. 997. 997. 997. 997.]
[1000. 1000. 1000. 1000. 998. 998. 998. 998.]
[1000. 1000. 1000. 1000. 998. 998. 998. 998.]
[1000. 1000. 1000. 1000. 996. 996. 996. 996.]
[1000. 1000. 1000. 1000. 996. 996. 996. 996.]
[1000. 1000. 1000. 1000. 996. 996. 996. 996.]
[1000. 1000. 1000. 1000. 996. 996. 996. 996.]]
Upvotes: 0
Views: 387
Reputation: 133
You are pretty much filling up your device's memory by repeatedly instancing buffers and program objects in your col1_0 function.
With that said, try this version of your python script:
import numpy as np
import os
from numpy.random import RandomState
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
for platform in cl.get_platforms():
print("Platform name:", platform.name)
platform = cl.get_platforms()
platform = [x for x in platform if "NVIDIA" in x.name][0]
device = platform.get_devices()
# ctx = cl.create_some_context()
ctx = cl.Context(device)
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags
n = 2**3
block_size = 2**2
bs = block_size
prng = RandomState(666666)
a=prng.uniform(0,100000,(n,n)) + 2
b = -a + 1
a = a.astype(np.float32)
b = b.astype(np.float32)
kernelpath = "./Stack_Overflow_pyopencl_question1.cl"
with open(kernelpath, "r") as f:
kernel_txt = "".join(f.readlines())
prg = cl.Program(ctx, kernel_txt.replace("FLAG_gsize", str(n)).replace("FLAG_lsize", str(bs))).build()
a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
def col1_0(a, b, device):
dest_buf = cl.Buffer(ctx, mf.READ_WRITE, a.nbytes )
wait = prg.col1_0(queue, a.shape, (bs,bs),np.int32(len(a)), a_buf, b_buf, dest_buf)
final = np.empty_like(a)
cl.enqueue_copy(queue, final , dest_buf, wait_for=[wait]).wait()
'''
a_buf.release()
b_buf.release()
'''
dest_buf.release()
return final
res = col1_0(a, b, device)
print("RES\n", res)
base = res
comp = np.zeros(base.shape)
repetitions = 1000000
for i in range(repetitions):
new = col1_0(a, b, device)
temp = np.isclose(new,base,atol=2.0e-03)
comp += temp
print(comp)
PS: You don't have to replace variables in your kernel code by parsing, there is an option for adding build options for the just in time compiler to the build() call:
options = " -DFLAG_lsize="+str(bs)
options += " -DFLAG_gsize="+str(n)
prg = cl.Program(ctx, kernel_txt).build(options=options)
Edit: Updated script to run 1,000,000 times. Results follow:
Platform name: NVIDIA CUDA
RES
[[ 72986.0390625 28263.28125 23286.80664062 21662.8203125
38600.4453125 56755.12109375 13160.14648438 77571.3046875 ]
[ 0. 21457.24609375 25730.015625 83697.6171875
33790.8203125 67593.1328125 6248.02148438 -13557.51855469]
[ 0. 23594.1875 -10326.51855469 66544.15625
-24266.70507812 -5718.11376953 76904.875 23694.08984375]
[ 0. 23392.27734375 82200.609375 74443.2421875
83087.4765625 63177.58984375 50563.83984375 31685.51953125]
[ 0. 2005.14135742 4741.21533203 6905.48291016
-15929.70019531 -28064.78710938 10786.97265625 -12347.80371094]
[ 0. 62289.42578125 72695.1875 -9519.17871094
42706.625 -6567.93164062 62263.578125 55469.78515625]
[ 0. 40805.6171875 26905.51367188 77325.453125
-8362.55078125 4206.67138672 78279.015625 28778.39453125]
[ 0. 31332.40625 78166.015625 36025.94921875
-18576.62109375 372.44696045 -1757.84912109 -43466.66015625]]
[[ 1000000. 1000000. 1000000. 1000000. 1000000. 1000000. 1000000.
1000000.]
[ 1000000. 1000000. 1000000. 1000000. 1000000. 1000000. 1000000.
1000000.]
[ 1000000. 1000000. 1000000. 1000000. 1000000. 1000000. 1000000.
1000000.]
[ 1000000. 1000000. 1000000. 1000000. 1000000. 1000000. 1000000.
1000000.]
[ 1000000. 1000000. 1000000. 1000000. 1000000. 1000000. 1000000.
1000000.]
[ 1000000. 1000000. 1000000. 1000000. 1000000. 1000000. 1000000.
1000000.]
[ 1000000. 1000000. 1000000. 1000000. 1000000. 1000000. 1000000.
1000000.]
[ 1000000. 1000000. 1000000. 1000000. 1000000. 1000000. 1000000.
1000000.]]
Upvotes: 1