Mantxu
Mantxu

Reputation: 319

Pyopencl memory access/non repetitive results

PROBLEM: I see that the output varies depending on the execution. It suggests a memory access problem that I did not manage to find.

ATTEMPT: I am trying to make linear operations in a 8x8 matrix with pyopencl. The example that I am showing operates the rows of the matrix, using the first row so the first column becomes 0 (except first row). The last loop executes this function "repetitions" times and compares it to the first execution, and counts the similar values. I never get as many similar value counts as repetitions where made, and the mismatch is located always in certain area of the matrix.

You can see in the cl code that I got desperate already trying to fix the problem with memory barriers. There must be something out of my understanding going on. Python code:

from __future__ import print_function
from __future__ import absolute_import
import pyopencl as cl
import numpy as np
import os
from numpy.random import RandomState
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
for platform in cl.get_platforms():
    print("Platform name:", platform.name)

platform = cl.get_platforms()
platform = [x for x in platform if "AMD" in x.name][0]
device = platform.get_devices()
#    ctx = cl.create_some_context()
ctx = cl.Context(device)
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags
n = 2**3
block_size = 2**2
bs = block_size
prng = RandomState(666666)
a=prng.uniform(0,100000,(n,n)) + 2
b = -a + 1
a = a.astype(np.float32)
b = b.astype(np.float32)
kernelpath = "./Stack_Overflow_pyopencl_question1.cl"
with open(kernelpath, "r") as f:
    kernel_txt = "".join(f.readlines())

def col1_0(a, b, device):
    a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
    b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
    dest_buf = cl.Buffer(ctx, mf.READ_WRITE, a.nbytes )
    prg = cl.Program(ctx, kernel_txt.replace("FLAG_gsize", str(n)).replace("FLAG_lsize", str(bs))).build()
    prg.col1_0(queue, a.shape, (bs,bs),np.int32(len(a)),
                         a_buf, b_buf, dest_buf)
    final = np.empty_like(a)
    cl.enqueue_copy(queue, final , dest_buf).wait()
    return final

res = col1_0(a, b, device)
print("RES\n", res)
base = res
comp = np.zeros(base.shape)
repetitions = 1000
for i in range(repetitions):
    new = col1_0(a, b, device)
    temp = np.isclose(new,base,atol=2.0e-03)
    comp += temp
print(comp)

Opencl code:

__kernel void col1_0(const unsigned int size, 
__global float * a, __global float * b, __global float * res) {

__local float la[FLAG_lsize*FLAG_lsize];
__local float lb[FLAG_lsize*FLAG_lsize];
__local float lc[FLAG_lsize*FLAG_lsize];
__local float rowp[FLAG_lsize];
__local float colp[FLAG_lsize];
__local float pivot;
uint i = get_global_id(0); 
uint li = get_local_id(0);
uint gi = get_group_id(0);
uint j = get_global_id(1); 
uint lj = get_local_id(1);
uint gj = get_group_id(1);
uint size_i = get_global_size(0); 
uint size_j = get_global_size(1); 
uint lsize_i = get_local_size(0);
uint lsize_j = get_local_size(1); 
uint row = j*size_i;
uint lrow = lj*lsize_i;
uint gind = i + row;
uint lind = li + lrow;
uint rp;
uint k;
barrier(CLK_GLOBAL_MEM_FENCE|CLK_LOCAL_MEM_FENCE);
res[gind] = a[gind];

for (k=0;k<1;k++) {
    rp = k*size_i;
    barrier(CLK_GLOBAL_MEM_FENCE|CLK_LOCAL_MEM_FENCE);
    pivot = res[k + rp];
    rowp[li] = res[i + rp];
    colp[lj] = res[k + row];
    la[lind] = res[gind];
    barrier(CLK_GLOBAL_MEM_FENCE|CLK_LOCAL_MEM_FENCE);
    if (j>k){
    lc[lind] = la[lind] - rowp[li]/pivot*colp[lj];
    }else{
    lc[lind] = la[lind];
    }
    barrier(CLK_GLOBAL_MEM_FENCE|CLK_LOCAL_MEM_FENCE);
    res[gind] = lc[lind];
    barrier(CLK_GLOBAL_MEM_FENCE|CLK_LOCAL_MEM_FENCE);
}   
}

Oputput example:

 [[ 72986.04    28263.281   23286.807   21662.82    38600.445   56755.12
   13160.146   77571.305 ]
 [     0.      21457.246   25730.016   83697.62    33790.82    67593.13
    6248.0215 -13557.52  ]
 [     0.      23594.188  -10326.518   66544.16   -24266.705   -5718.115
   76904.875   23694.09  ]
 [     0.      23392.277   82200.61    74443.24    83087.48    63177.59
   50563.84    31685.52  ]
 [     0.       2005.1416   4741.216    6905.4834 -15929.7    -28064.785
   10786.973  -12347.803 ]
 [     0.      62289.426   72695.19    -9519.179   42706.625   -6567.9316
   62263.58    55469.785 ]
 [     0.      40805.617   26905.514   77325.45    -8362.551    4206.672
   78279.016   28778.395 ]
 [     0.      31332.406   78166.016   36025.945  -18576.621     372.4453
   -1757.8496 -43466.66  ]]

Example of the comparison of 1000 outputs:

[[1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.]
 [1000. 1000. 1000. 1000.  997.  997.  997.  997.]
 [1000. 1000. 1000. 1000.  998.  998.  998.  998.]
 [1000. 1000. 1000. 1000.  998.  998.  998.  998.]
 [1000. 1000. 1000. 1000.  996.  996.  996.  996.]
 [1000. 1000. 1000. 1000.  996.  996.  996.  996.]
 [1000. 1000. 1000. 1000.  996.  996.  996.  996.]
 [1000. 1000. 1000. 1000.  996.  996.  996.  996.]]

Upvotes: 0

Views: 387

Answers (1)

denvercoder
denvercoder

Reputation: 133

You are pretty much filling up your device's memory by repeatedly instancing buffers and program objects in your col1_0 function.

  • Try to build your program only once and reuse buffers more often.
  • Try to release buffers that aren't used any more. Beware, that release() does not instantaneously release your memory, but only tells your driver to do that when it gets around to do it. Some might respond instantly some might not.

With that said, try this version of your python script:

import numpy as np
import os
from numpy.random import RandomState
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
for platform in cl.get_platforms():
    print("Platform name:", platform.name)

platform = cl.get_platforms()
platform = [x for x in platform if "NVIDIA" in x.name][0]
device = platform.get_devices()
#    ctx = cl.create_some_context()
ctx = cl.Context(device)
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags
n = 2**3
block_size = 2**2
bs = block_size
prng = RandomState(666666)
a=prng.uniform(0,100000,(n,n)) + 2
b = -a + 1
a = a.astype(np.float32)
b = b.astype(np.float32)
kernelpath = "./Stack_Overflow_pyopencl_question1.cl"
with open(kernelpath, "r") as f:
    kernel_txt = "".join(f.readlines())
prg = cl.Program(ctx, kernel_txt.replace("FLAG_gsize", str(n)).replace("FLAG_lsize", str(bs))).build()
a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)

def col1_0(a, b, device):

    dest_buf = cl.Buffer(ctx, mf.READ_WRITE, a.nbytes )

    wait = prg.col1_0(queue, a.shape, (bs,bs),np.int32(len(a)), a_buf, b_buf, dest_buf)
    final = np.empty_like(a)
    cl.enqueue_copy(queue, final , dest_buf, wait_for=[wait]).wait()

    '''
    a_buf.release()
    b_buf.release()
    '''
    dest_buf.release()

    return final

res = col1_0(a, b, device)
print("RES\n", res)
base = res
comp = np.zeros(base.shape)
repetitions = 1000000
for i in range(repetitions):
    new = col1_0(a, b, device)
    temp = np.isclose(new,base,atol=2.0e-03)
    comp += temp
print(comp)

PS: You don't have to replace variables in your kernel code by parsing, there is an option for adding build options for the just in time compiler to the build() call:

    options = " -DFLAG_lsize="+str(bs)
    options += " -DFLAG_gsize="+str(n)
    prg = cl.Program(ctx, kernel_txt).build(options=options)

Edit: Updated script to run 1,000,000 times. Results follow:

Platform name: NVIDIA CUDA
RES
 [[ 72986.0390625   28263.28125     23286.80664062  21662.8203125
   38600.4453125   56755.12109375  13160.14648438  77571.3046875 ]
 [     0.          21457.24609375  25730.015625    83697.6171875
   33790.8203125   67593.1328125    6248.02148438 -13557.51855469]
 [     0.          23594.1875     -10326.51855469  66544.15625
  -24266.70507812  -5718.11376953  76904.875       23694.08984375]
 [     0.          23392.27734375  82200.609375    74443.2421875
   83087.4765625   63177.58984375  50563.83984375  31685.51953125]
 [     0.           2005.14135742   4741.21533203   6905.48291016
  -15929.70019531 -28064.78710938  10786.97265625 -12347.80371094]
 [     0.          62289.42578125  72695.1875      -9519.17871094
   42706.625       -6567.93164062  62263.578125    55469.78515625]
 [     0.          40805.6171875   26905.51367188  77325.453125
   -8362.55078125   4206.67138672  78279.015625    28778.39453125]
 [     0.          31332.40625     78166.015625    36025.94921875
  -18576.62109375    372.44696045  -1757.84912109 -43466.66015625]]
[[ 1000000.  1000000.  1000000.  1000000.  1000000.  1000000.  1000000.
   1000000.]
 [ 1000000.  1000000.  1000000.  1000000.  1000000.  1000000.  1000000.
   1000000.]
 [ 1000000.  1000000.  1000000.  1000000.  1000000.  1000000.  1000000.
   1000000.]
 [ 1000000.  1000000.  1000000.  1000000.  1000000.  1000000.  1000000.
   1000000.]
 [ 1000000.  1000000.  1000000.  1000000.  1000000.  1000000.  1000000.
   1000000.]
 [ 1000000.  1000000.  1000000.  1000000.  1000000.  1000000.  1000000.
   1000000.]
 [ 1000000.  1000000.  1000000.  1000000.  1000000.  1000000.  1000000.
   1000000.]
 [ 1000000.  1000000.  1000000.  1000000.  1000000.  1000000.  1000000.
   1000000.]]

Upvotes: 1

Related Questions