Multi GPU clCreateBuffer failure on single context

Question

I have two Intel ARC A770 GPUs and I am seeing this behavior (test code attached).

I am trying to allocate memory using clCreateBuffer (1 MB each).
I have two GPU devices.
The test code creates a context, loads a kernel program, builds it.
The test program creates only one context (and can do that for 1 device or 2 devices).
When I run the code in 1 GPU mode, I am able to allocate memory (over 10000 allocations). But when I run the code in 2 GPU mode, I run out of memory (out of host memory after 1000 allocations -- around 1 GB memory). OpenCL error code -6.
I am able to allocate memory on both devices if I create two contexts.

The GPUs have 16 GB RAM and my host has 192 GB RAM. Any idea what I am doing wrong? The command to build the code is:

gcc -D CL_TARGET_OPENCL_VERSION=220 -g -Wall -o OpenCLMulti OpenCLMulti.cpp -lOpenCL -lm

Test code: OpenCLMulti.cpp

#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

#include 
#include 

#define MAX_PLATFORMS 1
#define MAX_GPUS 2

cl_context context;
cl_program program;
cl_device_id devices[MAX_GPUS] = {0};
cl_platform_id platformId[MAX_PLATFORMS];

const char* kernelFileName = "OpenCLKernels.cl";

unsigned char* getKernelCode(const char* filename, size_t* psize) {
  FILE* fp;
  size_t ret, size;
  unsigned char* bCode;

  fp = fopen(filename, "rb");
  if (fp == NULL) {
    fprintf(stderr, "Could not open kernels source file: %s", filename);
    return NULL;
  }

  fseek(fp, 0, SEEK_END);
  size = ftell(fp);
  rewind(fp);

  bCode = (unsigned char*) malloc(size);

  if ((ret = fread(bCode, 1, size, fp)) != size) {
    fprintf(stderr, "Could not read %ld bytes, got %ld", ret, size);
    return NULL;
  }
  fclose(fp);
  *psize = size;
  return bCode;
}

const char* buildOptions = "-DOPENCL -D CL_TARGET_OPENCL_VERSION=220";

void contextCallback(const char* errInfo, const void* privateInfo, size_t cb, void* userData) {
  printf("contextCallback %s
", errInfo);
}

cl_int setupGPU(int gpus, int* ngpus) {
  size_t size;
  cl_int err = CL_SUCCESS;
  char buffer[16384];
  cl_uint numPlatforms, ngpusOCL;
  unsigned char* bCode;

  err = clGetPlatformIDs(MAX_PLATFORMS, &platformId[0], &numPlatforms);
  if (err != CL_SUCCESS) {
    return err;
  }

  if (numPlatforms == 0) {
    fprintf(stderr, "No OpenCL platforms found
");
    return -100;
  }

  if (numPlatforms > 1) {
    fprintf(stderr, "Found more than one OpenCL platform. Choosing first.
");
  }

  err = clGetPlatformInfo(platformId[0], CL_PLATFORM_PROFILE, sizeof(buffer), buffer, &size);
  fprintf(stderr, "Platform profile: %s
", buffer);

  err = clGetPlatformInfo(platformId[0], CL_PLATFORM_VERSION, sizeof(buffer), buffer, &size);
  fprintf(stderr, "Platform version: %s
", buffer);

  err = clGetPlatformInfo(platformId[0], CL_PLATFORM_NAME, sizeof(buffer), buffer, &size);
  fprintf(stderr, "Name: %s
", buffer);

  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platformId[0], 0};
  err = clGetDeviceIDs(platformId[0], CL_DEVICE_TYPE_GPU, MAX_GPUS, &devices[0], &ngpusOCL);
  if (err != CL_SUCCESS) {
    return err;
  }
  fprintf(stderr, "Found %d devices
", ngpusOCL);


  bCode = getKernelCode(kernelFileName, &size);
  if (bCode == NULL) {
    fprintf(stderr, "Could not load kernel code
");
    return -1;
  }

  context = clCreateContext(properties, gpus, devices, contextCallback, NULL, &err);
  if (err != CL_SUCCESS) {
    fprintf(stderr, "Could not create GPU context: %d
", err);
    return err;
  }

  fprintf(stderr, "%d: OpenCL context created successfully
", 0);

  program = clCreateProgramWithSource(context, 1, (const char**) &bCode, &size, &err);
  if (program == NULL) {
    fprintf(stderr, "Could not create program from file %s, error %d
", kernelFileName, err);
    return err;
  }
  fprintf(stderr, "%d: Program loaded successfully
", 0);

  err = clBuildProgram(program, gpus, devices, buildOptions, NULL, NULL);
  if (err != CL_SUCCESS) {
    fprintf(stderr, "%d: Program build failed %d
", 0, err);
    size = 1024 * 1024 * 4;
    char* output = (char*) malloc(size);
    err = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(1024 * 1024 * 4), output, &size);
    if (err != CL_SUCCESS) {
      fprintf(stderr, "%d: Program build failed. Unable to get log. Error %d
", 0, err);
      return err;
    }
    fprintf(stderr, "%d: %s
", 0, output);
    free(output);
    return err;
  }
  free(bCode);

  fprintf(stderr, "OpenCL kernels built successfully
");
  *ngpus = ngpusOCL;
  return err;
}

int main(int argc, char* argv[]) {
  int ngpus, dgpus;
  cl_int err;
  size_t asize;
  cl_mem mem;

  ngpus = 1;
  for (int i = 1; i < argc; i++) {
    if (strcmp(argv[i], "--gpus") == 0) {
      ngpus = atoi(argv[i + 1]);
      i++;
    }
  }
  err = setupGPU(ngpus, &dgpus);

  if (err != CL_SUCCESS) {
    exit(1);
  }

  if (dgpus < ngpus) {
    fprintf(stderr, "Detected %d GPUs, requested %d. Exiting
", dgpus, ngpus);
    exit(1);
  }

  asize = 1 * 1024 * 1024L;
  for (int i = 0; i < 100000; i++) {
    mem = clCreateBuffer(context, CL_MEM_READ_WRITE, asize, NULL, &err);
    if (err != CL_SUCCESS) {
      fprintf(stderr, "Could not create memory, err %d", err);
      return err;
    }

    fprintf(stderr, "%d: Allocated %ld MB successfully
", i, asize / (1024 * 1024));
  }

  exit(0);

  exit(0);
}

Simple kernel: OpenCLKernels.cl

__kernel void dAxpy(int elements, float alpha, __global float *xData, int xOffset, int incX, __global float *yData, int yOffset, int incY) {
  long idx;
  int threadId, blockIdx, blockIdy, blockDim, gridDim;

  threadId = get_local_id(0); blockIdx = get_group_id(0); blockIdy = get_group_id(1); blockDim = get_local_size(0); gridDim = get_num_groups(0);
  idx = (blockIdx + blockIdy * gridDim) * blockDim + threadId;
  if (idx < elements) {
    yData[yOffset + idx * incY] += alpha * xData[xOffset + idx * incX];
  }
}

The above code is:

A simple test program that allocates 1 MB buffers using a single context. When I register one device in the context, I am able to allocate over 1000 buffers (approximate 1 GB of memory). When I register two devices in the context, I get out-of-host-memory error at the 1000th allocation. I expect I should be able to allocate more buffers.

Multi GPU clCreateBuffer failure on single context

Answers (0)

Related Questions