Eagle
Eagle

Reputation: 1197

C/CUDA - Modifying CUDA/GL interop example to store image in a memory buffer

I am trying to store the image, which is generated by the CUDA-OpenGL interop example in the 'CUDA-By Example' textbook, into a memory buffer that can store to images.

I want to store two images, one which is a green "X" and another which is an orangish "X", in a memory buffer. When I render the pBuffer with OpenGL, I should get a green "X" image like the example output, however, I just get a black screen. I am not sure why I am not getting the right output. Could someone please tell me what's wrong?

I obtained the code for the memory buffer from A Memory buffer for multiple images

    #include "book.h"
    #include "cpu_bitmap.h"
    #include "cuda.h"
    #include <cuda_gl_interop.h>

    PFNGLBINDBUFFERARBPROC    glBindBuffer     = NULL;
    PFNGLDELETEBUFFERSARBPROC glDeleteBuffers  = NULL;
    PFNGLGENBUFFERSARBPROC    glGenBuffers     = NULL;
    PFNGLBUFFERDATAARBPROC    glBufferData     = NULL;

    #define     DIM    512

    #define IMAGESIZE_MAX (DIM*DIM)  // MY CHANGE

    GLuint  bufferObj;
    cudaGraphicsResource *resource;

    // based on ripple code, but uses uchar4 which is the type of data
    // graphic inter op uses. see screenshot - basic2.png

    __global__ void kernel( uchar4 *ptr1)   
    {
   // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x ;

    // now calculate the value at that position
    float fx = x/(float)DIM - 0.5f;
    float fy = y/(float)DIM - 0.5f;
    unsigned char   green = 128 + 127 * tan( abs(fx*100) - abs(fy*100) );

    // accessing uchar4 vs unsigned char*
    ptr1[offset].x = 0;
    ptr1[offset].y = green;
    ptr1[offset].z = 0;
    ptr1[offset].w = 255;

    }

    // MY CODE

    __global__ void kernel2( uchar4 *ptr2) 
    {
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x ;

    // now calculate the value at that position
    float fx = x/(float)DIM - 0.5f;
    float fy = y/(float)DIM - 0.5f;
    unsigned char   green = 128 + 127 * tan( abs(fx*100) - abs(fy*100) );

    // accessing uchar4 vs unsigned char*
    ptr2[offset].x = 1000;
    ptr2[offset].y = green;
    ptr2[offset].z = 0;
    ptr2[offset].w = 255;
    }

    __global__ void copy ( uchar4 *pBuffer, uchar4 *Ptr, uchar4 *Ptr2, size_t size, int a )
    {

    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int idx = x + y * blockDim.x * gridDim.x ;
    int bdx = idx;

    if (a==1)
    {
    while ( idx < DIM*DIM)
    {
    pBuffer[idx] = Ptr[idx] ;
    __syncthreads();
    if (idx==DIM*DIM)
    {
     break;
    }
    }
    }


    if (a==2)
    {
    while ( (idx < DIM*DIM) && (bdx < DIM*DIM) )
    {
    uchar4 temp = Ptr2[bdx];
    __syncthreads();

    pBuffer[idx+4] = temp;
    __syncthreads();

    if ((idx==DIM*DIM) && (bdx==DIM*DIM))
    {
    break;
    }
    }  
    }

    }


    void key_func( unsigned char key, int x, int y ) 
    {
    switch (key) 
    {
    case 27:
        // clean up OpenGL and CUDA
        ( cudaGraphicsUnregisterResource( resource ) );
        glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0 );
        glDeleteBuffers( 1, &bufferObj );
        exit(0);
    }
    }

    void draw_func( void ) {
    // we pass zero as the last parameter, because out bufferObj is now
    // the source, and the field switches from being a pointer to a
    // bitmap to now mean an offset into a bitmap object
    glDrawPixels( DIM, DIM, GL_RGBA, GL_UNSIGNED_BYTE, 0 );
    glutSwapBuffers();
    }


    int main( int argc, char **argv ) 
    {
    cudaDeviceProp  prop;
    int dev;

    memset( &prop, 0, sizeof( cudaDeviceProp ) );
    prop.major = 1;
    prop.minor = 0;
    ( cudaChooseDevice( &dev, &prop ) );

    // tell CUDA which dev we will be using for graphic interop
    // from the programming guide:  Interoperability with OpenGL
    //     requires that the CUDA device be specified by
    //     cudaGLSetGLDevice() before any other runtime calls.

    ( cudaGLSetGLDevice( dev ) );

    // these GLUT calls need to be made before the other OpenGL
    // calls, else we get a seg fault
    glutInit( &argc, argv );
    glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
    glutInitWindowSize( DIM, DIM );
    glutCreateWindow( "bitmap" );

    glBindBuffer    = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
    glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
    glGenBuffers    = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
    glBufferData    = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");

    // the first three are standard OpenGL, the 4th is the CUDA reg 
    // of the bitmap these calls exist starting in OpenGL 1.5
    glGenBuffers( 1, &bufferObj );
    glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj );
    glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * 4 ,NULL, GL_DYNAMIC_DRAW_ARB );

    // REGISTER THE GL BufferObj and CUDA Resource
    ( cudaGraphicsGLRegisterBuffer( &resource, bufferObj, cudaGraphicsMapFlagsNone ) );

    // do work with the memory dst being on the GPU, gotten via mapping
    HANDLE_ERROR( cudaGraphicsMapResources( 1, &resource, NULL ) );

    // MY MODIFIED CODE

    uchar4 *devPtr; 
    size_t size;
    size_t sizeTotal = 0;
    cudaMalloc ( (uchar4 **)&devPtr,  size);

    uchar4 *devPtr2; 

    cudaMalloc ( (uchar4 **)&devPtr2,  size); 

    uchar4 *pBuffer;

    (cudaMalloc ( (uchar4 **)&pBuffer,  size));

    uchar4 *pBufferCurrent;

    (cudaMalloc ( (uchar4 **)&pBufferCurrent,  size));

    uchar4 *pBufferImage;

    (cudaMalloc ( (uchar4 **)&pBufferImage,  size));

    // REGISTER THE C BUFFER and CUDA Resource

    HANDLE_ERROR( 
    cudaGraphicsResourceGetMappedPointer( (void**)&pBufferImage, 
                                          &size, 
                                          resource) );

    dim3    grids(DIM/16,DIM/16);
    dim3    threads(16,16);


    kernel<<<grids,threads>>>(devPtr);
    kernel2<<<grids,threads>>>(devPtr2);

    int a = 1;
    do 
    {

    if (a==1)
    {
    copy<<< grids, threads>>>(pBufferImage, devPtr, devPtr2, size, a);  
    }

    if(a==2)
    {
    copy<<< grids, threads>>>(pBufferImage, devPtr, devPtr2, size, a);
    }

    a++;

    } while (a<=2); 

    cudaGraphicsUnmapResources( 1, &resource, NULL ) );

    // set up GLUT and kick off main loop
    glutKeyboardFunc( key_func );
    glutDisplayFunc( draw_func );
    glutMainLoop();
   }

Upvotes: 3

Views: 1480

Answers (1)

Robert Crovella
Robert Crovella

Reputation: 151869

Start by doing proper cuda error checking on all your cuda API calls (e.g. cudaMemcpy, etc.) and kernel calls.

When you do that you'll discover that your kernels are not running successfully. These types of things won't work:

uchar4 *devPtr;   // you've just created an unallocated NULL host pointer
size_t img1_size = IMAGESIZE_MAX;

kernel<<<grids,threads>>>(devPtr);  // this kernel will fail


uchar4 *devPtr2;   // you've just created an unallocated NULL host pointer
size_t img2_size = IMAGESIZE_MAX;

kernel2<<<grids,threads>>>(devPtr2);  // this kernel will fail

devPtr and devPtr2 in the above code are NULL pointers. You haven't allocated any storage associated with them. Furthermore, since you are passing them to device kernels, they need to be allocated with cudaMalloc or similar API function, in order for the pointers to be usable in device code.

Since they are not allocated with cudaMalloc, as soon as you try to dereference those pointers in device code, you'll create a kernel fault. This will be evident if you do error checking, as you will have "unspecified launch failure" or similar report from those kernels.

I think there are probably a number of other problems in your code, but first you should do proper cuda error checking and at least get your code to the point where everything you've written is, in fact, running.

And the code you've posted doesn't actually compile.

After fixing the compile errors I also discovered that you have another infinite loop:

cudaMalloc ( (uchar4 **)&pBufferCurrent,  sizeTotal + sizeof(size) + size); 
cudaMalloc ( (uchar4 **)&pBuffer,  sizeTotal + sizeof(size) + size);

do 
{

if (!pBufferCurrent)
{
break;
}

pBuffer = pBufferCurrent;

pBufferCurrent += sizeTotal;

imageget ( pBufferCurrent + sizeof(size), size, devPtr);

sizeTotal += (sizeof(size) + size);

} while (a==1); 

Since a is initialized to 1 in your loop, and nothing in the loop modifies a, the loop will never exit based on the while condition. Since pBufferCurrent is also never zero if it's been properly set up by cudaMalloc, the break will never be taken.

If you malloc or cudaMalloc a pointer called pBufferCurrent, it's hard for me to imagine under what circumstances this would ever make sense:

pBufferCurrent += sizeTotal;

and although this is legal, I don't see how this makes sense:

pBuffer = pBufferCurrent;

You just created an allocation for pBuffer using cudaMalloc, but the first thing you do is throw it away?

Upvotes: 2

Related Questions