Reputation: 33
I have two Intel ARC A770 GPUs and I am seeing this behavior (test code attached).
The GPUs have 16 GB RAM and my host has 192 GB RAM. Any idea what I am doing wrong? The command to build the code is:
gcc -D CL_TARGET_OPENCL_VERSION=220 -g -Wall -o OpenCLMulti OpenCLMulti.cpp -lOpenCL -lm
Test code: OpenCLMulti.cpp
#include <sys/time.h>
#include <sys/sysinfo.h>
#include <sys/stat.h>
#include <assert.h>
#include <errno.h>
#include <math.h>
#include <getopt.h>
#include <sys/time.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <CL/cl.h>
#include <clBLAS.h>
#define MAX_PLATFORMS 1
#define MAX_GPUS 2
cl_context context;
cl_program program;
cl_device_id devices[MAX_GPUS] = {0};
cl_platform_id platformId[MAX_PLATFORMS];
const char* kernelFileName = "OpenCLKernels.cl";
unsigned char* getKernelCode(const char* filename, size_t* psize) {
FILE* fp;
size_t ret, size;
unsigned char* bCode;
fp = fopen(filename, "rb");
if (fp == NULL) {
fprintf(stderr, "Could not open kernels source file: %s", filename);
return NULL;
}
fseek(fp, 0, SEEK_END);
size = ftell(fp);
rewind(fp);
bCode = (unsigned char*) malloc(size);
if ((ret = fread(bCode, 1, size, fp)) != size) {
fprintf(stderr, "Could not read %ld bytes, got %ld", ret, size);
return NULL;
}
fclose(fp);
*psize = size;
return bCode;
}
const char* buildOptions = "-DOPENCL -D CL_TARGET_OPENCL_VERSION=220";
void contextCallback(const char* errInfo, const void* privateInfo, size_t cb, void* userData) {
printf("contextCallback %s\n", errInfo);
}
cl_int setupGPU(int gpus, int* ngpus) {
size_t size;
cl_int err = CL_SUCCESS;
char buffer[16384];
cl_uint numPlatforms, ngpusOCL;
unsigned char* bCode;
err = clGetPlatformIDs(MAX_PLATFORMS, &platformId[0], &numPlatforms);
if (err != CL_SUCCESS) {
return err;
}
if (numPlatforms == 0) {
fprintf(stderr, "No OpenCL platforms found\n");
return -100;
}
if (numPlatforms > 1) {
fprintf(stderr, "Found more than one OpenCL platform. Choosing first.\n");
}
err = clGetPlatformInfo(platformId[0], CL_PLATFORM_PROFILE, sizeof(buffer), buffer, &size);
fprintf(stderr, "Platform profile: %s\n", buffer);
err = clGetPlatformInfo(platformId[0], CL_PLATFORM_VERSION, sizeof(buffer), buffer, &size);
fprintf(stderr, "Platform version: %s\n", buffer);
err = clGetPlatformInfo(platformId[0], CL_PLATFORM_NAME, sizeof(buffer), buffer, &size);
fprintf(stderr, "Name: %s\n", buffer);
cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platformId[0], 0};
err = clGetDeviceIDs(platformId[0], CL_DEVICE_TYPE_GPU, MAX_GPUS, &devices[0], &ngpusOCL);
if (err != CL_SUCCESS) {
return err;
}
fprintf(stderr, "Found %d devices\n", ngpusOCL);
bCode = getKernelCode(kernelFileName, &size);
if (bCode == NULL) {
fprintf(stderr, "Could not load kernel code\n");
return -1;
}
context = clCreateContext(properties, gpus, devices, contextCallback, NULL, &err);
if (err != CL_SUCCESS) {
fprintf(stderr, "Could not create GPU context: %d\n", err);
return err;
}
fprintf(stderr, "%d: OpenCL context created successfully\n", 0);
program = clCreateProgramWithSource(context, 1, (const char**) &bCode, &size, &err);
if (program == NULL) {
fprintf(stderr, "Could not create program from file %s, error %d\n", kernelFileName, err);
return err;
}
fprintf(stderr, "%d: Program loaded successfully\n", 0);
err = clBuildProgram(program, gpus, devices, buildOptions, NULL, NULL);
if (err != CL_SUCCESS) {
fprintf(stderr, "%d: Program build failed %d\n", 0, err);
size = 1024 * 1024 * 4;
char* output = (char*) malloc(size);
err = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(1024 * 1024 * 4), output, &size);
if (err != CL_SUCCESS) {
fprintf(stderr, "%d: Program build failed. Unable to get log. Error %d\n", 0, err);
return err;
}
fprintf(stderr, "%d: %s\n", 0, output);
free(output);
return err;
}
free(bCode);
fprintf(stderr, "OpenCL kernels built successfully\n");
*ngpus = ngpusOCL;
return err;
}
int main(int argc, char* argv[]) {
int ngpus, dgpus;
cl_int err;
size_t asize;
cl_mem mem;
ngpus = 1;
for (int i = 1; i < argc; i++) {
if (strcmp(argv[i], "--gpus") == 0) {
ngpus = atoi(argv[i + 1]);
i++;
}
}
err = setupGPU(ngpus, &dgpus);
if (err != CL_SUCCESS) {
exit(1);
}
if (dgpus < ngpus) {
fprintf(stderr, "Detected %d GPUs, requested %d. Exiting\n", dgpus, ngpus);
exit(1);
}
asize = 1 * 1024 * 1024L;
for (int i = 0; i < 100000; i++) {
mem = clCreateBuffer(context, CL_MEM_READ_WRITE, asize, NULL, &err);
if (err != CL_SUCCESS) {
fprintf(stderr, "Could not create memory, err %d", err);
return err;
}
fprintf(stderr, "%d: Allocated %ld MB successfully\n", i, asize / (1024 * 1024));
}
exit(0);
exit(0);
}
Simple kernel: OpenCLKernels.cl
__kernel void dAxpy(int elements, float alpha, __global float *xData, int xOffset, int incX, __global float *yData, int yOffset, int incY) {
long idx;
int threadId, blockIdx, blockIdy, blockDim, gridDim;
threadId = get_local_id(0); blockIdx = get_group_id(0); blockIdy = get_group_id(1); blockDim = get_local_size(0); gridDim = get_num_groups(0);
idx = (blockIdx + blockIdy * gridDim) * blockDim + threadId;
if (idx < elements) {
yData[yOffset + idx * incY] += alpha * xData[xOffset + idx * incX];
}
}
The above code is:
A simple test program that allocates 1 MB buffers using a single context. When I register one device in the context, I am able to allocate over 1000 buffers (approximate 1 GB of memory). When I register two devices in the context, I get out-of-host-memory error at the 1000th allocation. I expect I should be able to allocate more buffers.
Upvotes: 1
Views: 46