Yoelson
Yoelson

Reputation: 41

Registering Mapped Linux Character Device Memory with cudaHostRegister Results in Invalid Argument

I'm trying to boost DMA<->CPU<->GPU data transfer by: 1. Mapping my (proprietary) device Linux Kernel allocated memory to user space 2. Registering the later (mapped memory) to Cuda with cudaHostRegister API function.

While mapping User Space allocated memory mapped to my device DMA and then registered to Cuda with cudaHostRegister works just fine, trying to register "kmalloced" memory results in "Invalid Argument" error returned by cudaHostRegister.

First I thought the problem was with alignment or my device driver complicated memory pool management, so I've written a simplest character device which implements .mmap() where kzalloced 10Kb buffer is remapped with remap_pfn_range and the problem still stands.

Unfortunately, I did not find any resembling questions over the Net, so I sincerely hope I'll find an answer here.

Some system info and Kernel driver <-> user space app code + runtime log info:

CUDA    : 8.0
OS Dist : Ubuntu 14.04
Kernel  : 3.16.0-31-generic
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 375.26                 Driver Version: 375.26                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|                                                                               
|   0  GeForce GTX 770     Off  | 0000:83:00.0     N/A |                  N/A |
| 26%   32C    P8    N/A /  N/A |     79MiB /  1997MiB |     N/A      Default |
+-------------------------------+----------------------+----------------------+

Character device mmap() code:

#define MEM_CHUNK_SIZE  4 * _K
#define MEM_POOL_SIZE   10 * _K
/**/
static int  chdv_mmap(struct file *filp, struct vm_area_struct *vma)
{
    unsigned int pages_per_buf = ( MEM_CHUNK_SIZE >> PAGE_SHIFT ) ;
    unsigned long pfn, vsize;

    /*make sure the buffer is allocated*/
    if((NULL == g_membuff) && 
       (NULL == (g_membuff = kzalloc(MEM_POOL_SIZE , GFP_KERNEL))))
    {
        kdbgprintln("Error: Not enough memory");
        return -ENOMEM;
    }

    vsize = vma->vm_end - vma->vm_start ;

    kdbgprintln("MEM_CHUNK_SIZE %u, pages_per_buf %u, vsize %lu  vma->vm_pgoff %lu",
            MEM_CHUNK_SIZE,
            pages_per_buf,
            vsize,
            vma->vm_pgoff);
    if(vsize > MEM_POOL_SIZE)
    {
        kdbgprintln("Error: vsize %lu > MEM_POOL_SIZE %u", vsize, MEM_POOL_SIZE);
        return -EINVAL;
    }

    /* We allow only mapping of one whole buffer so offset must be multiple
     * of pages_per_buf and size must be equal to dma_buf_size.
     */
    if( vma->vm_pgoff % pages_per_buf ) 
    {
        kdbgprintln("Error:Mapping DMA buffers is allowed only from beginning");
        return -EINVAL ;
    }

    vma->vm_flags = vma->vm_flags | (VM_DONTEXPAND | VM_LOCKED | VM_IO);

    /*Get the PFN for remap*/
    pfn = page_to_pfn(virt_to_page((unsignedcudaHostRegister  char *)g_membuff));

    kdbgprintln("PFN : %lu", pfn);

    if(remap_pfn_range(vma, vma->vm_start, pfn, vsize, vma->vm_page_prot))
    {
        kdbgprintln("Error:Failed to remap memory");
        return -EINVAL;
    }

    /*Sealing data header & footer*/
    *((unsigned long *)g_membuff)       = 0xCDFFFFFFFFFFFFAB;
    *((unsigned long *)g_membuff + 1)   = 0xAB000000000000EF;
    *(unsigned long *)((unsigned char *)g_membuff + vsize - sizeof(unsigned long)) = 0xEF0000000C0000AA;

    kdbgprintln("Mapped 'kalloc' buffer" \
            "\n\t\tFirst  8 bytes: %lX" \
            "\n\t\tSecond 8 bytes: %lX" \
            "\n\t\tLast   8 bytes: %lX",
            *((unsigned long *)g_membuff),
            *((unsigned long *)g_membuff + 1),
            *(unsigned long *)((unsigned char *)g_membuff + vsize - sizeof(unsigned long)));

    return 0;
}

Test Application code:

static unsigned long map_mem_size;

int main(int argc, char** argv)
{
    int fd;
    const char dev_name[] = "/dev/chardev";
    void * address = NULL;
    long page_off = 0;
    cudaError_t cudarc;

    switch(argc)
    {
    case 2:
        page_off = atoi(argv[1]) * getpagesize();
        break;
    default:
        page_off = 0;
        break;
    }

    map_mem_size = 2 * getpagesize();

    printf("Opening %s file\n", dev_name);
    errno = 0;
    if(0 > (fd = open(dev_name, O_RDWR) ))
    {
        printf("Error %d - %s\n", errno, strerror(errno));
    }
    else
    {
        printf("About to map %lu bytes of %s device memory\n", map_mem_size, dev_name);

        errno = 0;
        if(MAP_FAILED == (address = mmap(NULL, map_mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, page_off)))
        {
            printf("Error %d - %s\n", errno, strerror(errno));
        }
        else
        {
            printf("mapped %s driver 'kmalloc' memory" \
                    "\n\t\tFirst  8 bytes : %lX" \
                    "\n\t\tSecond 8 bytes: %lX" \
                    "\n\t\tLast   8 bytes: %lX\n",
                    dev_name,
                    *((unsigned long *)address),
                    *((unsigned long *)address + 1),
                    *(unsigned long *)((unsigned char *)address + map_mem_size - sizeof(unsigned long)));

            if (cudaSuccess != (cudarc = cudaHostRegister(address, map_mem_size, cudaHostRegisterDefault)))
            {
                printf("Error: Failed cudaHostRegister: %s, address %p\n", cudaGetErrorString(cudarc), address);
            }
        }
    }

    /*Release resources block*/

    return EXIT_SUCCESS;
}

Run time debug information:

User space:

./chrdev_test 
Opening /dev/chardev file
About to map 8192 bytes of /dev/chardev device memory
mapped /dev/chardev driver 'kmalloc' memory
                First  8 bytes : CDFFFFFFFFFFFFAB
                Second 8 bytes: AB000000000000EF
                Last   8 bytes: EF0000000C0000AA
Error: Failed cudaHostRegister: invalid argument
Unmapping /dev/chardev file
Closing /dev/chardev file

Kernel space (tail -f /var/log/syslog):

 [ 4814.119537] [chardev] chardev.c, chdv_mmap, line 292:MEM_CHUNK_SIZE 4096, pages_per_buf 1, vsize 8192  vma->vm_pgoff 0
    [ 4814.119538] [chardev] chardev.c, chdv_mmap, line 311:PFN : 16306184
    [ 4814.119543] [chardev] chardev.c, chdv_mmap, line 330:Mapped 'kzalloced' buffer
    [ 4814.119543]           First  8 bytes: CDFFFFFFFFFFFFAB
    [ 4814.119543]           Second 8 bytes: AB000000000000EF
    [ 4814.119543]           Last   8 bytes: EF0000000C0000AA

Thanks ahead.

Upvotes: 1

Views: 1034

Answers (1)

Yoelson
Yoelson

Reputation: 41

Made it work!

Thanks, Yoel.

Upvotes: 1

Related Questions