Reputation: 75
i am currently working on a sha256 kernel in OpenCL. I am having trouble reading the output of the kernel if i write values to it via the kernel. If i don't write any value to the variable the code works without issues. I tried to write diffrent values to the variable but i am still having no success. This killed almost 3 days of my freetime, i hope you can help me solving this before i lose my mind.
this is my dependency.
<!-- https://mvnrepository.com/artifact/org.jocl/jocl -->
<dependency>
<groupId>org.jocl</groupId>
<artifactId>jocl</artifactId>
<version>2.0.5</version>
</dependency>
i also added the JOCL Struct Jar from the Utils section of this page: http://www.jocl.org/utilities/utilities.html
this is my Main class
package org.joclforce;
import org.jocl.*;
import org.jocl.struct.Buffers;
import org.jocl.struct.CLTypes;
import org.jocl.struct.SizeofStruct;
import org.joclforce.Structs.WorkerInput;
import org.joclforce.Structs.WorkerOutput;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.ByteBuffer;
import java.util.stream.Collectors;
import static org.jocl.CL.*;
import static org.jocl.CL.clCreateKernel;
public class Main {
private static String programSource = "";
// CL state
private static cl_context context;
private static cl_command_queue commandQueue;
private static cl_kernel kernel;
private static cl_program program;
public Main() {
try {
programSource = Main.getResourceFileAsString("sha256.cl");
} catch (IOException e) {
throw new RuntimeException(e);
}
startKernel();
}
private void startKernel() {
defaultInitialization();
// Initialization of an array containing some inputs
int n = 3;
WorkerInput[] workerInputs = new WorkerInput[n];
WorkerOutput[] workerOutputs = new WorkerOutput[n];
for (int i=0; i<n; i++)
{
workerInputs[i] = new WorkerInput();
workerOutputs[i] = new WorkerOutput();
workerInputs[i].message = new CLTypes.cl_uint[8];
workerOutputs[i].digest = new CLTypes.cl_uint[8];
for( int c = 0; c < 8; c++ ) {
workerInputs[i].message[c] = new CLTypes.cl_uint();
workerOutputs[i].digest[c] = new CLTypes.cl_uint();
workerInputs[i].message[c].set(0);
workerOutputs[i].digest[c].set(0);
}
}
int structSizeInput = SizeofStruct.sizeof(WorkerInput.class);
int structSizeOutput = SizeofStruct.sizeof(WorkerOutput.class);
// Allocate a buffer that can store the data
ByteBuffer inputBuffer = Buffers.allocateBuffer(workerInputs);
ByteBuffer outputBuffer = Buffers.allocateBuffer(workerOutputs);
// Write the data into the buffer
Buffers.writeToBuffer(inputBuffer, workerInputs);
Buffers.writeToBuffer(outputBuffer, workerOutputs);
// Allocate the memory object for the data that
// contains the data from the struct buffer
cl_mem inputsMem = clCreateBuffer(context,
CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
(long) structSizeInput * n, Pointer.to(inputBuffer), null);
cl_mem outputsMem = clCreateBuffer(context,
CL_MEM_READ_WRITE,
(long) structSizeOutput * n, Pointer.to(outputBuffer), null);
// Set the arguments for the kernel
clSetKernelArg(kernel, 0, Sizeof.cl_mem, Pointer.to(inputsMem));
clSetKernelArg(kernel, 1, Sizeof.cl_mem, Pointer.to(outputBuffer));
// Set the work-item dimensions
long global_work_size[] = new long[]{n};
// Execute the kernel
clEnqueueNDRangeKernel(commandQueue, kernel, 1, null,
global_work_size, null, 0, null, null);
// Read back the data from to memory object to the object buffer
clEnqueueReadBuffer(commandQueue, outputsMem, true, 0,
(long) structSizeOutput * n, Pointer.to(outputBuffer), 0 , null, null);
// Read the data from the object buffer back into the object
outputBuffer.rewind();
Buffers.readFromBuffer(outputBuffer, workerOutputs);
// Print the result
for (int i=0; i<n; i++)
{
System.out.println(i+": "+workerOutputs[i]);
}
// Clean up
clReleaseMemObject(inputsMem);
clReleaseMemObject(outputsMem);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(commandQueue);
clReleaseContext(context);
}
private void defaultInitialization()
{
// Obtain the platform IDs and initialize the context properties
cl_platform_id platforms[] = new cl_platform_id[1];
clGetPlatformIDs(platforms.length, platforms, null);
cl_context_properties contextProperties = new cl_context_properties();
contextProperties.addProperty(CL_CONTEXT_PLATFORM, platforms[0]);
// Create an OpenCL context on a GPU device
context = clCreateContextFromType(
contextProperties, CL_DEVICE_TYPE_GPU, null, null, null);
if (context == null)
{
// If no context for a GPU device could be created,
// try to create one for a CPU device.
context = clCreateContextFromType(
contextProperties, CL_DEVICE_TYPE_CPU, null, null, null);
if (context == null)
{
System.out.println("Unable to create a context");
return;
}
}
// Enable exceptions and subsequently omit error checks in this sample
CL.setExceptionsEnabled(true);
// Get the list of GPU devices associated with the context
long numBytes[] = new long[1];
clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, null, numBytes);
// Obtain the cl_device_id for the first device
int numDevices = (int) numBytes[0] / Sizeof.cl_device_id;
cl_device_id devices[] = new cl_device_id[numDevices];
clGetContextInfo(context, CL_CONTEXT_DEVICES, numBytes[0],
Pointer.to(devices), null);
// Create a command-queue
commandQueue =
clCreateCommandQueue(context, devices[0], 0, null);
// Create the program from the source code
program = clCreateProgramWithSource(context,
1, new String[]{ programSource }, null, null);
// Build the program
clBuildProgram(program, 0, null, null, null, null);
// Create the kernel
kernel = clCreateKernel(program, "sha256", null);
}
public static void main(String[] args) {
Main main = new Main();
}
/**
* Reads given resource file as a string.
*
* @param fileName path to the resource file
* @return the file's contents
* @throws IOException if read fails for any reason
*/
static String getResourceFileAsString(String fileName) throws IOException {
ClassLoader classLoader = ClassLoader.getSystemClassLoader();
try (InputStream is = classLoader.getResourceAsStream(fileName)) {
if (is == null) return null;
try (InputStreamReader isr = new InputStreamReader(is);
BufferedReader reader = new BufferedReader(isr)) {
return reader.lines().collect(Collectors.joining(System.lineSeparator()));
}
}
}
}
this are my two structs for the parameters:
package org.joclforce.Structs;
import org.jocl.struct.ArrayLength;
import org.jocl.struct.CLTypes.*;
import org.jocl.struct.Struct;
import java.util.Arrays;
public class WorkerInput extends Struct {
@ArrayLength(8)
public cl_uint[] message;
public String toString()
{
return "WorkerInput[" +
"message="+ Arrays.toString(message)+"]";
}
}
and
package org.joclforce.Structs;
import org.jocl.struct.ArrayLength;
import org.jocl.struct.CLTypes;
import org.jocl.struct.Struct;
import java.util.Arrays;
public class WorkerOutput extends Struct {
@ArrayLength(8)
public CLTypes.cl_uint[] digest;
public String toString()
{
return "WorkerOutput[" +
"digest="+ Arrays.toString(digest)+"]";
}
}
and finaly this is my kernel that i use
#ifndef uint8
#define uint8 unsigned char
#endif
#ifndef uint32
#define uint32 unsigned int
#endif
typedef struct {
uint32 state[8];
} sha256_context;
typedef struct {
uint32 message[8];
} sha256_input;
typedef struct {
uint32 digest[8];
} sha256_output;
#define GET_UINT32(n,b,i) \
{ \
(n) = ( (uint32) (b)[(i) ] << 24 ) \
| ( (uint32) (b)[(i) + 1] << 16 ) \
| ( (uint32) (b)[(i) + 2] << 8 ) \
| ( (uint32) (b)[(i) + 3] ); \
}
#define PUT_UINT32(n,b,i) \
{ \
(b)[(i) ] = (uint8) ( (n) >> 24 ); \
(b)[(i) + 1] = (uint8) ( (n) >> 16 ); \
(b)[(i) + 2] = (uint8) ( (n) >> 8 ); \
(b)[(i) + 3] = (uint8) ( (n) ); \
}
inline void sha256_starts(sha256_context * ctx)
{
ctx->state[0] = 0x6A09E667;
ctx->state[1] = 0xBB67AE85;
ctx->state[2] = 0x3C6EF372;
ctx->state[3] = 0xA54FF53A;
ctx->state[4] = 0x510E527F;
ctx->state[5] = 0x9B05688C;
ctx->state[6] = 0x1F83D9AB;
ctx->state[7] = 0x5BE0CD19;
}
inline void sha256_process(sha256_context * ctx, uint8 data[64])
{
uint32 temp1, temp2, W[64];
uint32 A, B, C, D, E, F, G, H;
GET_UINT32(W[0], data, 0);
GET_UINT32(W[1], data, 4);
GET_UINT32(W[2], data, 8);
GET_UINT32(W[3], data, 12);
GET_UINT32(W[4], data, 16);
GET_UINT32(W[5], data, 20);
GET_UINT32(W[6], data, 24);
GET_UINT32(W[7], data, 28);
GET_UINT32(W[8], data, 32);
GET_UINT32(W[9], data, 36);
GET_UINT32(W[10], data, 40);
GET_UINT32(W[11], data, 44);
GET_UINT32(W[12], data, 48);
GET_UINT32(W[13], data, 52);
GET_UINT32(W[14], data, 56);
GET_UINT32(W[15], data, 60);
#define SHR(x,n) ((x & 0xFFFFFFFF) >> n)
#define ROTR(x,n) (SHR(x,n) | (x << (32 - n)))
#define S0(x) (ROTR(x, 7) ^ ROTR(x,18) ^ SHR(x, 3))
#define S1(x) (ROTR(x,17) ^ ROTR(x,19) ^ SHR(x,10))
#define S2(x) (ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22))
#define S3(x) (ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25))
#define F0(x,y,z) ((x & y) | (z & (x | y)))
#define F1(x,y,z) (z ^ (x & (y ^ z)))
#define R(t) \
( \
W[t] = S1(W[t - 2]) + W[t - 7] + \
S0(W[t - 15]) + W[t - 16] \
)
#define P(a,b,c,d,e,f,g,h,x,K) \
{ \
temp1 = h + S3(e) + F1(e,f,g) + K + x; \
temp2 = S2(a) + F0(a,b,c); \
d += temp1; h = temp1 + temp2; \
}
A = ctx->state[0];
B = ctx->state[1];
C = ctx->state[2];
D = ctx->state[3];
E = ctx->state[4];
F = ctx->state[5];
G = ctx->state[6];
H = ctx->state[7];
P(A, B, C, D, E, F, G, H, W[0], 0x428A2F98);
P(H, A, B, C, D, E, F, G, W[1], 0x71374491);
P(G, H, A, B, C, D, E, F, W[2], 0xB5C0FBCF);
P(F, G, H, A, B, C, D, E, W[3], 0xE9B5DBA5);
P(E, F, G, H, A, B, C, D, W[4], 0x3956C25B);
P(D, E, F, G, H, A, B, C, W[5], 0x59F111F1);
P(C, D, E, F, G, H, A, B, W[6], 0x923F82A4);
P(B, C, D, E, F, G, H, A, W[7], 0xAB1C5ED5);
P(A, B, C, D, E, F, G, H, W[8], 0xD807AA98);
P(H, A, B, C, D, E, F, G, W[9], 0x12835B01);
P(G, H, A, B, C, D, E, F, W[10], 0x243185BE);
P(F, G, H, A, B, C, D, E, W[11], 0x550C7DC3);
P(E, F, G, H, A, B, C, D, W[12], 0x72BE5D74);
P(D, E, F, G, H, A, B, C, W[13], 0x80DEB1FE);
P(C, D, E, F, G, H, A, B, W[14], 0x9BDC06A7);
P(B, C, D, E, F, G, H, A, W[15], 0xC19BF174);
P(A, B, C, D, E, F, G, H, R(16), 0xE49B69C1);
P(H, A, B, C, D, E, F, G, R(17), 0xEFBE4786);
P(G, H, A, B, C, D, E, F, R(18), 0x0FC19DC6);
P(F, G, H, A, B, C, D, E, R(19), 0x240CA1CC);
P(E, F, G, H, A, B, C, D, R(20), 0x2DE92C6F);
P(D, E, F, G, H, A, B, C, R(21), 0x4A7484AA);
P(C, D, E, F, G, H, A, B, R(22), 0x5CB0A9DC);
P(B, C, D, E, F, G, H, A, R(23), 0x76F988DA);
P(A, B, C, D, E, F, G, H, R(24), 0x983E5152);
P(H, A, B, C, D, E, F, G, R(25), 0xA831C66D);
P(G, H, A, B, C, D, E, F, R(26), 0xB00327C8);
P(F, G, H, A, B, C, D, E, R(27), 0xBF597FC7);
P(E, F, G, H, A, B, C, D, R(28), 0xC6E00BF3);
P(D, E, F, G, H, A, B, C, R(29), 0xD5A79147);
P(C, D, E, F, G, H, A, B, R(30), 0x06CA6351);
P(B, C, D, E, F, G, H, A, R(31), 0x14292967);
P(A, B, C, D, E, F, G, H, R(32), 0x27B70A85);
P(H, A, B, C, D, E, F, G, R(33), 0x2E1B2138);
P(G, H, A, B, C, D, E, F, R(34), 0x4D2C6DFC);
P(F, G, H, A, B, C, D, E, R(35), 0x53380D13);
P(E, F, G, H, A, B, C, D, R(36), 0x650A7354);
P(D, E, F, G, H, A, B, C, R(37), 0x766A0ABB);
P(C, D, E, F, G, H, A, B, R(38), 0x81C2C92E);
P(B, C, D, E, F, G, H, A, R(39), 0x92722C85);
P(A, B, C, D, E, F, G, H, R(40), 0xA2BFE8A1);
P(H, A, B, C, D, E, F, G, R(41), 0xA81A664B);
P(G, H, A, B, C, D, E, F, R(42), 0xC24B8B70);
P(F, G, H, A, B, C, D, E, R(43), 0xC76C51A3);
P(E, F, G, H, A, B, C, D, R(44), 0xD192E819);
P(D, E, F, G, H, A, B, C, R(45), 0xD6990624);
P(C, D, E, F, G, H, A, B, R(46), 0xF40E3585);
P(B, C, D, E, F, G, H, A, R(47), 0x106AA070);
P(A, B, C, D, E, F, G, H, R(48), 0x19A4C116);
P(H, A, B, C, D, E, F, G, R(49), 0x1E376C08);
P(G, H, A, B, C, D, E, F, R(50), 0x2748774C);
P(F, G, H, A, B, C, D, E, R(51), 0x34B0BCB5);
P(E, F, G, H, A, B, C, D, R(52), 0x391C0CB3);
P(D, E, F, G, H, A, B, C, R(53), 0x4ED8AA4A);
P(C, D, E, F, G, H, A, B, R(54), 0x5B9CCA4F);
P(B, C, D, E, F, G, H, A, R(55), 0x682E6FF3);
P(A, B, C, D, E, F, G, H, R(56), 0x748F82EE);
P(H, A, B, C, D, E, F, G, R(57), 0x78A5636F);
P(G, H, A, B, C, D, E, F, R(58), 0x84C87814);
P(F, G, H, A, B, C, D, E, R(59), 0x8CC70208);
P(E, F, G, H, A, B, C, D, R(60), 0x90BEFFFA);
P(D, E, F, G, H, A, B, C, R(61), 0xA4506CEB);
P(C, D, E, F, G, H, A, B, R(62), 0xBEF9A3F7);
P(B, C, D, E, F, G, H, A, R(63), 0xC67178F2);
ctx->state[0] += A;
ctx->state[1] += B;
ctx->state[2] += C;
ctx->state[3] += D;
ctx->state[4] += E;
ctx->state[5] += F;
ctx->state[6] += G;
ctx->state[7] += H;
}
/* Write a 32-bit big-endian long value to a buffer. */
inline void WriteLong(unsigned char *buf, int value)
{
buf[0] = (unsigned char)(value >> 24);
buf[1] = (unsigned char)(value >> 16);
buf[2] = (unsigned char)(value >> 8);
buf[3] = (unsigned char)value;
}
inline void sha256_update(sha256_context * ctx, uint8 * input)
{
int i;
unsigned char buffer[64] = { 0 };
for(i = 0; input[i]; i++)
buffer[i] = input[i];
char *p = (char *) buffer;
for (i = 0; i != 64 && p[i]; i++);
p[i] = 0x80;
unsigned long tbl = i << 3;
WriteLong(buffer + 56, (unsigned int)(tbl >> 32));
WriteLong(buffer + 60, (unsigned int)tbl);
sha256_process(ctx, buffer);
}
__kernel void sha256(__global sha256_input * input, __global sha256_output * output)
{
int gid = get_global_id(0);
uint key[16] = { 0 };
int i;
for (i = 0; input[gid].message[i]; i++)
key[i] = input[gid].message[i];
sha256_context ctx;
sha256_starts(&ctx);
sha256_update(&ctx, (uint8 *) key);
printf("%u\n", ctx.state[0]);
output[gid].digest[0] = ctx.state[0]; //Line 219 Producing the Error
/**
output[gid].digest[0] = ctx.state[0];
output[gid].digest[1] = ctx.state[1];
output[gid].digest[2] = ctx.state[2];
output[gid].digest[3] = ctx.state[3];
output[gid].digest[4] = ctx.state[4];
output[gid].digest[5] = ctx.state[5];
output[gid].digest[6] = ctx.state[6];
output[gid].digest[7] = ctx.state[7];
**/
}
If you comment out line 219 in the kernel the error will disapear. I tried using diffrent ways to write values in line 219 but with no success.
Thanks for anyway who has the sanity to help me with this
Upvotes: 0
Views: 100
Reputation: 75
i solved the issue by myself, i just made a basic mistake using the wrong pointer the line clSetKernelArg(kernel, 1, Sizeof.cl_mem, Pointer.to(outputBuffer));
it actually should be clSetKernelArg(kernel, 1, Sizeof.cl_mem, Pointer.to(outputsMem));
Upvotes: 0