Reputation: 89
This seems to be a strange but very basic problem. I tried doing a simple operation in PyOpenCL. In the code below, if I multiply my positions by exp(-f_sum/sigma2)/sigma2, I get 0 (even though both the positions and sigma have non-zero values), but when I add the value instead, I get the correct result.
# OpenCL C source for the `forceFinder` kernel (compiled at runtime by pyopencl).
# For every row i of datacl it writes, for each component k,
#   res[i, k] = (datacl[i, k] - poscl[k]) * exp(-|poscl - datacl[i]|^2 / sigma^2) / sigma^2
# NOTE(review): in single precision the exponential underflows to 0 once its
# argument drops below roughly -87 to -103, so an all-zero result with "*"
# (but a plausible-looking one with "+") may simply mean sigma is small
# compared with the distances -- confirm against the host-side inputs.
kernelsource = """
__kernel void forceFinder(
    const int N,
    const int dim,
    const float sigma,
    const float resistant,   // not referenced in this kernel body
    __global float* datacl,
    __constant float* poscl,
    __global float* res
    )
{
    // One work-item per row i of datacl; each row holds dim floats.
    int i = get_global_id(0);
    if (i >= N) {
        return;   // surplus work-items beyond the N rows do nothing
    }

    float sigma2 = sigma * sigma;

    // Squared Euclidean distance between poscl and row i of datacl.
    float f_sum = 0;
    for (int k = 0; k < dim; k++)
    {
        f_sum += pown((poscl[k] - datacl[i * dim + k]), 2);
    }

    // The Gaussian weight depends only on i, so it is evaluated once per row
    // instead of once per component.
    float weight = exp(-f_sum / sigma2);
    for (int k = 0; k < dim; k++)
    {
        res[i * dim + k] = (datacl[i * dim + k] - poscl[k]) * weight / sigma2;
    }
}
"""
If I replace the "*" in the last loop with "+", I get the output.
Upvotes: 0
Views: 108
Reputation: 93
I tried to understand what your kernel is doing by writing a complete example (see below). Although I failed to understand completely what's happening, I receive as a result a 5x10 matrix (N x dim) with the value -0.0024 for all entries when I run the code below on my machine. Please provide a complete example, or more information about what you expect as the result, if you need further help.
Additionally: Can you get rid of your first for-loop by using exp(x)*exp(x)=exp(x+x)?
import pyopencl as cl
import numpy as np
# OpenCL C source for the `forceFinder` kernel (compiled at runtime by pyopencl).
# For every row i of datacl it writes, for each component k,
#   res[i, k] = (datacl[i, k] - poscl[k]) * exp(-|poscl - datacl[i]|^2 / sigma^2) / sigma^2
kernelsource = """
__kernel void forceFinder( const int dim,
                           const float sigma,
                           __global float* datacl,
                           __constant float* poscl,
                           __global float* res ){
    // One work-item per row of datacl. There is no bounds check, so the
    // global size must not exceed the number of rows in datacl / res.
    int i = get_global_id(0);
    float sigma2 = sigma * sigma;

    // Squared Euclidean distance between poscl and row i of datacl.
    float f_sum = 0;
    for (int k = 0; k < dim; k++){
        f_sum += pown((poscl[k] - datacl[i * dim + k]), 2);
    }

    // The Gaussian weight is the same for every component of the row,
    // so it is evaluated once rather than inside the loop.
    float weight = exp(-f_sum / sigma2);
    for (int k = 0; k < dim; k++){
        res[i * dim + k] = (datacl[i * dim + k] - poscl[k]) * weight / sigma2;
    }
}
"""
# --- OpenCL setup: first device of the first platform -----------------------
first_platform = cl.get_platforms()[0]
device = first_platform.get_devices()[0]
context = cl.Context([device])
queue = cl.CommandQueue(context)
program = cl.Program(context, kernelsource).build()

# --- Problem size and kernel parameter --------------------------------------
N, dim = 5, 10
sigma = 20

# --- Host arrays (float32, matching the kernel's float buffers) -------------
poscl_local = np.full(dim, 2.0, dtype=np.float32)    # reference position, all 2.0
datacl_local = np.ones((N, dim), dtype=np.float32)   # N data rows, all 1.0
res_local = np.zeros_like(datacl_local)              # receives the kernel output

# --- Device buffers ---------------------------------------------------------
mf = cl.mem_flags
read_only_copy = mf.READ_ONLY | mf.COPY_HOST_PTR
poscl_buf = cl.Buffer(context, read_only_copy, hostbuf=poscl_local)
datacl_buf = cl.Buffer(context, read_only_copy, hostbuf=datacl_local)
res_buf = cl.Buffer(context, mf.WRITE_ONLY, res_local.nbytes)

# --- Launch one work-item per data row, then fetch and show the result ------
program.forceFinder(
    queue, (N,), None,
    np.int32(dim), np.float32(sigma),
    datacl_buf, poscl_buf, res_buf,
)
cl.enqueue_copy(queue, res_local, res_buf)
print("result: {}".format(res_local))
Upvotes: 1