Reputation: 1197
I am trying to store the image generated by the CUDA-OpenGL interop example in the 'CUDA by Example' textbook into a memory buffer that can hold two images.
I want to store two images in that buffer: one a green "X" and the other an orange-ish "X". When I render pBuffer with OpenGL, I should get a green "X" image like the example output; however, I just get a black screen. I am not sure why I am not getting the right output. Could someone please tell me what's wrong?
I obtained the code for the memory buffer from A Memory buffer for multiple images.
#include "book.h"
#include "cpu_bitmap.h"
#include "cuda.h"
#include <cuda_gl_interop.h>
PFNGLBINDBUFFERARBPROC glBindBuffer = NULL;
PFNGLDELETEBUFFERSARBPROC glDeleteBuffers = NULL;
PFNGLGENBUFFERSARBPROC glGenBuffers = NULL;
PFNGLBUFFERDATAARBPROC glBufferData = NULL;
#define DIM 512
#define IMAGESIZE_MAX (DIM*DIM) // MY CHANGE
GLuint bufferObj;
cudaGraphicsResource *resource;
// based on ripple code, but uses uchar4 which is the type of data
// graphics interop uses. see screenshot - basic2.png
__global__ void kernel( uchar4 *ptr1)
{
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x ;

    // now calculate the value at that position
    float fx = x/(float)DIM - 0.5f;
    float fy = y/(float)DIM - 0.5f;
    unsigned char green = 128 + 127 * tan( abs(fx*100) - abs(fy*100) );

    // accessing uchar4 vs unsigned char*
    ptr1[offset].x = 0;
    ptr1[offset].y = green;
    ptr1[offset].z = 0;
    ptr1[offset].w = 255;
}
// MY CODE
__global__ void kernel2( uchar4 *ptr2)
{
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x ;

    // now calculate the value at that position
    float fx = x/(float)DIM - 0.5f;
    float fy = y/(float)DIM - 0.5f;
    unsigned char green = 128 + 127 * tan( abs(fx*100) - abs(fy*100) );

    // accessing uchar4 vs unsigned char*
    ptr2[offset].x = 1000;
    ptr2[offset].y = green;
    ptr2[offset].z = 0;
    ptr2[offset].w = 255;
}
__global__ void copy ( uchar4 *pBuffer, uchar4 *Ptr, uchar4 *Ptr2, size_t size, int a )
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int idx = x + y * blockDim.x * gridDim.x ;
    int bdx = idx;

    if (a==1)
    {
        while ( idx < DIM*DIM)
        {
            pBuffer[idx] = Ptr[idx] ;
            __syncthreads();
            if (idx==DIM*DIM)
            {
                break;
            }
        }
    }

    if (a==2)
    {
        while ( (idx < DIM*DIM) && (bdx < DIM*DIM) )
        {
            uchar4 temp = Ptr2[bdx];
            __syncthreads();
            pBuffer[idx+4] = temp;
            __syncthreads();
            if ((idx==DIM*DIM) && (bdx==DIM*DIM))
            {
                break;
            }
        }
    }
}
void key_func( unsigned char key, int x, int y )
{
    switch (key)
    {
        case 27:
            // clean up OpenGL and CUDA
            ( cudaGraphicsUnregisterResource( resource ) );
            glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0 );
            glDeleteBuffers( 1, &bufferObj );
            exit(0);
    }
}
void draw_func( void ) {
    // we pass zero as the last parameter, because our bufferObj is now
    // the source, and the field switches from being a pointer to a
    // bitmap to now mean an offset into a bitmap object
    glDrawPixels( DIM, DIM, GL_RGBA, GL_UNSIGNED_BYTE, 0 );
    glutSwapBuffers();
}
int main( int argc, char **argv )
{
    cudaDeviceProp prop;
    int dev;
    memset( &prop, 0, sizeof( cudaDeviceProp ) );
    prop.major = 1;
    prop.minor = 0;
    ( cudaChooseDevice( &dev, &prop ) );

    // tell CUDA which dev we will be using for graphic interop
    // from the programming guide: Interoperability with OpenGL
    // requires that the CUDA device be specified by
    // cudaGLSetGLDevice() before any other runtime calls.
    ( cudaGLSetGLDevice( dev ) );

    // these GLUT calls need to be made before the other OpenGL
    // calls, else we get a seg fault
    glutInit( &argc, argv );
    glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
    glutInitWindowSize( DIM, DIM );
    glutCreateWindow( "bitmap" );

    glBindBuffer = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
    glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
    glGenBuffers = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
    glBufferData = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");

    // the first three are standard OpenGL, the 4th is the CUDA reg
    // of the bitmap these calls exist starting in OpenGL 1.5
    glGenBuffers( 1, &bufferObj );
    glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj );
    glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * 4 ,NULL, GL_DYNAMIC_DRAW_ARB );

    // REGISTER THE GL BufferObj and CUDA Resource
    ( cudaGraphicsGLRegisterBuffer( &resource, bufferObj, cudaGraphicsMapFlagsNone ) );

    // do work with the memory dst being on the GPU, gotten via mapping
    HANDLE_ERROR( cudaGraphicsMapResources( 1, &resource, NULL ) );

    // MY MODIFIED CODE
    uchar4 *devPtr;
    size_t size;
    size_t sizeTotal = 0;
    cudaMalloc ( (uchar4 **)&devPtr, size);
    uchar4 *devPtr2;
    cudaMalloc ( (uchar4 **)&devPtr2, size);
    uchar4 *pBuffer;
    (cudaMalloc ( (uchar4 **)&pBuffer, size));
    uchar4 *pBufferCurrent;
    (cudaMalloc ( (uchar4 **)&pBufferCurrent, size));
    uchar4 *pBufferImage;
    (cudaMalloc ( (uchar4 **)&pBufferImage, size));

    // REGISTER THE C BUFFER and CUDA Resource
    HANDLE_ERROR(
        cudaGraphicsResourceGetMappedPointer( (void**)&pBufferImage,
                                              &size,
                                              resource) );

    dim3 grids(DIM/16,DIM/16);
    dim3 threads(16,16);
    kernel<<<grids,threads>>>(devPtr);
    kernel2<<<grids,threads>>>(devPtr2);

    int a = 1;
    do
    {
        if (a==1)
        {
            copy<<< grids, threads>>>(pBufferImage, devPtr, devPtr2, size, a);
        }
        if(a==2)
        {
            copy<<< grids, threads>>>(pBufferImage, devPtr, devPtr2, size, a);
        }
        a++;
    } while (a<=2);

    cudaGraphicsUnmapResources( 1, &resource, NULL ) );

    // set up GLUT and kick off main loop
    glutKeyboardFunc( key_func );
    glutDisplayFunc( draw_func );
    glutMainLoop();
}
Upvotes: 3
Views: 1480
Reputation: 151869
Start by doing proper cuda error checking on all your cuda API calls (e.g. cudaMemcpy, etc.) and kernel calls.
When you do that you'll discover that your kernels are not running successfully. These types of things won't work:
uchar4 *devPtr; // you've just created an unallocated NULL host pointer
size_t img1_size = IMAGESIZE_MAX;
kernel<<<grids,threads>>>(devPtr); // this kernel will fail
uchar4 *devPtr2; // you've just created an unallocated NULL host pointer
size_t img2_size = IMAGESIZE_MAX;
kernel2<<<grids,threads>>>(devPtr2); // this kernel will fail
devPtr and devPtr2 in the above code are NULL pointers. You haven't allocated any storage associated with them. Furthermore, since you are passing them to device kernels, they need to be allocated with cudaMalloc or similar API function, in order for the pointers to be usable in device code.
Since they are not allocated with cudaMalloc, as soon as you try to dereference those pointers in device code, you'll create a kernel fault. This will be evident if you do error checking, as you will have "unspecified launch failure" or similar report from those kernels.
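As a rough sketch (reusing your grids, threads, and kernel names, with the buffer size just my assumption based on DIM), the allocations would go before the kernel launches:

uchar4 *devPtr = NULL, *devPtr2 = NULL;
size_t imgBytes = DIM * DIM * sizeof(uchar4);    // one DIM x DIM image of uchar4 pixels

HANDLE_ERROR( cudaMalloc( (void**)&devPtr,  imgBytes ) );   // devPtr now points to real device memory
HANDLE_ERROR( cudaMalloc( (void**)&devPtr2, imgBytes ) );

kernel<<<grids,threads>>>( devPtr );     // the kernels now have valid storage to write into
kernel2<<<grids,threads>>>( devPtr2 );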
I think there are probably a number of other problems in your code, but first you should do proper cuda error checking and at least get your code to the point where everything you've written is, in fact, running.
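For example (just a sketch, using the HANDLE_ERROR macro that book.h already provides), every API call and every kernel launch gets checked:

HANDLE_ERROR( cudaMalloc( (void**)&devPtr, imgBytes ) );    // wrap every CUDA API call

kernel<<<grids,threads>>>( devPtr );
HANDLE_ERROR( cudaGetLastError() );         // catches launch errors (bad configuration, etc.)
HANDLE_ERROR( cudaDeviceSynchronize() );    // catches faults that occur while the kernel runs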
And the code you've posted doesn't actually compile.
After fixing the compile errors I also discovered that you have another infinite loop:
cudaMalloc ( (uchar4 **)&pBufferCurrent, sizeTotal + sizeof(size) + size);
cudaMalloc ( (uchar4 **)&pBuffer, sizeTotal + sizeof(size) + size);
do
{
    if (!pBufferCurrent)
    {
        break;
    }
    pBuffer = pBufferCurrent;
    pBufferCurrent += sizeTotal;
    imageget ( pBufferCurrent + sizeof(size), size, devPtr);
    sizeTotal += (sizeof(size) + size);
} while (a==1);
Since a is initialized to 1 in your loop, and nothing in the loop modifies a, the loop will never exit based on the while condition. Since pBufferCurrent is also never zero if it's been properly set up by cudaMalloc, the break will never be taken.
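One way to make a loop like that terminate (purely a sketch of the control flow, not your intended image layout; numImages is a hypothetical count and imageget is your routine) is to bound it by an explicit count and advance an offset each pass:

const int numImages = 2;                          // hypothetical: how many images to pack
size_t sizeTotal = 0;
for (int i = 0; i < numImages; i++)
{
    imageget( pBufferCurrent + sizeTotal + sizeof(size), size, devPtr );  // write at the current offset
    sizeTotal += sizeof(size) + size;             // something actually changes each pass, so the loop ends
}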
If you malloc or cudaMalloc a pointer called pBufferCurrent, it's hard for me to imagine under what circumstances this would ever make sense:
pBufferCurrent += sizeTotal;
and although this is legal, I don't see how this makes sense:
pBuffer = pBufferCurrent;
You just created an allocation for pBuffer using cudaMalloc, but the first thing you do is throw it away?
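If the goal is to pack several images into one allocation, a pattern that does make sense (again just a sketch, with totalBytes as a placeholder for whatever total size you need) is to leave the pointer returned by cudaMalloc untouched and track the write position with a separate offset:

uchar4 *base = NULL;                              // returned by cudaMalloc, never overwritten or incremented
HANDLE_ERROR( cudaMalloc( (void**)&base, totalBytes ) );

size_t offset = 0;                                // current write position, in uchar4 elements
// ... copy the first image to base + offset ...
offset += DIM * DIM;
// ... copy the second image to base + offset ...
offset += DIM * DIM;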
Upvotes: 2