Pyopencl memory access/non repetitive results

Question

PROBLEM: I see that the output varies depending on the execution. It suggests a memory access problem that I did not manage to find.

ATTEMPT: I am trying to make linear operations in a 8x8 matrix with pyopencl. The example that I am showing operates the rows of the matrix, using the first row so the first column becomes 0 (except first row). The last loop executes this function "repetitions" times and compares it to the first execution, and counts the similar values. I never get as many similar value counts as repetitions where made, and the mismatch is located always in certain area of the matrix.

You can see in the cl code that I got desperate already trying to fix the problem with memory barriers. There must be something out of my understanding going on. Python code:

from __future__ import print_function
from __future__ import absolute_import
import pyopencl as cl
import numpy as np
import os
from numpy.random import RandomState
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
for platform in cl.get_platforms():
    print("Platform name:", platform.name)

platform = cl.get_platforms()
platform = [x for x in platform if "AMD" in x.name][0]
device = platform.get_devices()
#    ctx = cl.create_some_context()
ctx = cl.Context(device)
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags
n = 2**3
block_size = 2**2
bs = block_size
prng = RandomState(666666)
a=prng.uniform(0,100000,(n,n)) + 2
b = -a + 1
a = a.astype(np.float32)
b = b.astype(np.float32)
kernelpath = "./Stack_Overflow_pyopencl_question1.cl"
with open(kernelpath, "r") as f:
    kernel_txt = "".join(f.readlines())

def col1_0(a, b, device):
    a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
    b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
    dest_buf = cl.Buffer(ctx, mf.READ_WRITE, a.nbytes )
    prg = cl.Program(ctx, kernel_txt.replace("FLAG_gsize", str(n)).replace("FLAG_lsize", str(bs))).build()
    prg.col1_0(queue, a.shape, (bs,bs),np.int32(len(a)),
                         a_buf, b_buf, dest_buf)
    final = np.empty_like(a)
    cl.enqueue_copy(queue, final , dest_buf).wait()
    return final

res = col1_0(a, b, device)
print("RES
", res)
base = res
comp = np.zeros(base.shape)
repetitions = 1000
for i in range(repetitions):
    new = col1_0(a, b, device)
    temp = np.isclose(new,base,atol=2.0e-03)
    comp += temp
print(comp)

Opencl code:

__kernel void col1_0(const unsigned int size, 
__global float * a, __global float * b, __global float * res) {

__local float la[FLAG_lsize*FLAG_lsize];
__local float lb[FLAG_lsize*FLAG_lsize];
__local float lc[FLAG_lsize*FLAG_lsize];
__local float rowp[FLAG_lsize];
__local float colp[FLAG_lsize];
__local float pivot;
uint i = get_global_id(0); 
uint li = get_local_id(0);
uint gi = get_group_id(0);
uint j = get_global_id(1); 
uint lj = get_local_id(1);
uint gj = get_group_id(1);
uint size_i = get_global_size(0); 
uint size_j = get_global_size(1); 
uint lsize_i = get_local_size(0);
uint lsize_j = get_local_size(1); 
uint row = j*size_i;
uint lrow = lj*lsize_i;
uint gind = i + row;
uint lind = li + lrow;
uint rp;
uint k;
barrier(CLK_GLOBAL_MEM_FENCE|CLK_LOCAL_MEM_FENCE);
res[gind] = a[gind];

for (k=0;k<1;k++) {
    rp = k*size_i;
    barrier(CLK_GLOBAL_MEM_FENCE|CLK_LOCAL_MEM_FENCE);
    pivot = res[k + rp];
    rowp[li] = res[i + rp];
    colp[lj] = res[k + row];
    la[lind] = res[gind];
    barrier(CLK_GLOBAL_MEM_FENCE|CLK_LOCAL_MEM_FENCE);
    if (j>k){
    lc[lind] = la[lind] - rowp[li]/pivot*colp[lj];
    }else{
    lc[lind] = la[lind];
    }
    barrier(CLK_GLOBAL_MEM_FENCE|CLK_LOCAL_MEM_FENCE);
    res[gind] = lc[lind];
    barrier(CLK_GLOBAL_MEM_FENCE|CLK_LOCAL_MEM_FENCE);
}   
}

Oputput example:

 [[ 72986.04    28263.281   23286.807   21662.82    38600.445   56755.12
   13160.146   77571.305 ]
 [     0.      21457.246   25730.016   83697.62    33790.82    67593.13
    6248.0215 -13557.52  ]
 [     0.      23594.188  -10326.518   66544.16   -24266.705   -5718.115
   76904.875   23694.09  ]
 [     0.      23392.277   82200.61    74443.24    83087.48    63177.59
   50563.84    31685.52  ]
 [     0.       2005.1416   4741.216    6905.4834 -15929.7    -28064.785
   10786.973  -12347.803 ]
 [     0.      62289.426   72695.19    -9519.179   42706.625   -6567.9316
   62263.58    55469.785 ]
 [     0.      40805.617   26905.514   77325.45    -8362.551    4206.672
   78279.016   28778.395 ]
 [     0.      31332.406   78166.016   36025.945  -18576.621     372.4453
   -1757.8496 -43466.66  ]]

Example of the comparison of 1000 outputs:

[[1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.]
 [1000. 1000. 1000. 1000.  997.  997.  997.  997.]
 [1000. 1000. 1000. 1000.  998.  998.  998.  998.]
 [1000. 1000. 1000. 1000.  998.  998.  998.  998.]
 [1000. 1000. 1000. 1000.  996.  996.  996.  996.]
 [1000. 1000. 1000. 1000.  996.  996.  996.  996.]
 [1000. 1000. 1000. 1000.  996.  996.  996.  996.]
 [1000. 1000. 1000. 1000.  996.  996.  996.  996.]]

Pyopencl memory access/non repetitive results

Answers (1)

Related Questions