kyle moats

Reputation: 21

How do I execute both cufftXt and CUDA kernels on multiple GPUs?

I would like to use two GPUs to execute a kernel and then execute a single FFT using cufftXt. The data could be several GB in size. My understanding of allocating memory for kernels on two GPUs is that the host array should be split in half, with the first half copied to GPU 0 and the second half to GPU 1. The following example shows how this could be done.

#include <iostream>
#define _USE_MATH_DEFINES
#include <math.h>
#include <ctime>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include <string>
#include <stdlib.h>
#include <stdio.h>

#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>

using namespace std;

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

// Cube each real sample; the (i % (N+2)) test skips the two padding
// entries that cuFFT's in-place R2C layout appends to every row of N reals.
__global__ void Cube (cufftReal *data, cufftReal *data3, int N, int real_size) {

    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < real_size){
        float x = (i % (N+2));
        if (x < N){
            data3[i] = pow(data[i], 3.0f);
        }
        else{
            data3[i] = 0.0f;
        }
    }
}


int main (int argc, char **argv) {

    int x;
    int N = 8;

    // In-place R2C layout: N*(N/2+1) complex outputs, i.e. rows of N reals
    // padded to N+2, for 2*N*(N/2+1) reals in total.
    int cplx_size = N * (N/2 + 1);
    int real_size = 2 * cplx_size;
    int mem_size = sizeof(cufftReal)*real_size;
    int half_real_size = real_size/2;
    int half_mem_size = mem_size/2;

    cufftReal *h_data = (cufftReal*)malloc(mem_size);
    cufftReal *h_data3 = (cufftReal*)malloc(mem_size);

    cufftReal *h0_data = (cufftReal*)malloc(half_mem_size);
    cufftReal *h0_data3 = (cufftReal*)malloc(half_mem_size);
    cufftReal *h1_data = (cufftReal*)malloc(half_mem_size);
    cufftReal *h1_data3 = (cufftReal*)malloc(half_mem_size);

    // Fill the data columns with 2 and zero the padding columns.
    for(int i=0; i<real_size; i++){
        x = (i % (N+2));
        if(x < N){h_data[i] = 2;}
        else{h_data[i] = 0;}
    }

    for(int i=0; i<half_real_size; i++){
        h0_data[i] = h_data[i];
        h1_data[i] = h_data[i+half_real_size];
    }

    cufftReal *d0_data;
    cufftReal *d0_data3;
    cufftReal *d1_data;
    cufftReal *d1_data3;

    cudaSetDevice(0);
    gpuErrchk(cudaMalloc((void**)&d0_data, half_mem_size));
    gpuErrchk(cudaMalloc((void**)&d0_data3, half_mem_size));
    cudaSetDevice(1);
    gpuErrchk(cudaMalloc((void**)&d1_data, half_mem_size));
    gpuErrchk(cudaMalloc((void**)&d1_data3, half_mem_size));

cout <<"device memory allocated" <<endl;

    int maxThreads = (N>1024) ? 1024 : N;
    int threadsPerBlock = maxThreads;
    // Round up so sizes that are not a multiple of the block size are still covered.
    int numBlocks = (half_real_size + threadsPerBlock - 1)/threadsPerBlock;

    cudaSetDevice(0);
    gpuErrchk(cudaMemcpy(d0_data, h0_data, half_mem_size, cudaMemcpyHostToDevice));
    cudaSetDevice(1);
    gpuErrchk(cudaMemcpy(d1_data, h1_data, half_mem_size, cudaMemcpyHostToDevice));

cout <<"mem copied to devices" <<endl;

    cudaSetDevice(0);
    Cube <<<numBlocks, threadsPerBlock>>> (d0_data, d0_data3, N, half_real_size);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
    cudaSetDevice(1);
    Cube <<<numBlocks, threadsPerBlock>>> (d1_data, d1_data3, N, half_real_size);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    cudaSetDevice(0);
    gpuErrchk(cudaMemcpy(h0_data3, d0_data3, half_mem_size, cudaMemcpyDeviceToHost));
    cudaSetDevice(1);
    gpuErrchk(cudaMemcpy(h1_data3, d1_data3, half_mem_size, cudaMemcpyDeviceToHost));   

    cout <<endl;
    for(int i = 0; i<half_real_size; i++){
        cout <<h0_data3[i] <<" ";
    }
    cout <<endl;
    for(int i = 0; i<half_real_size; i++){
        cout <<h1_data3[i] <<" ";
    }

    //clean up
    cudaSetDevice(0);
    cudaFree(d0_data);
    cudaFree(d0_data3);
    cudaSetDevice(1);
    cudaFree(d1_data);
    cudaFree(d1_data3);
    free(h_data); free(h_data3);
    free(h0_data); free(h0_data3); free(h1_data); free(h1_data3);
    return 0;
}

However, I do not see how this approach is compatible with cufftXt. It appears that I should use the helper function cufftXtMemcpy, which splits the data across the devices automatically. But if I do that, the multi-GPU kernel method shown above is not usable unless I allocate separate device memory for cufftXt and for the kernels. Is there any way to run both cufftXt and my own kernels without allocating the device memory twice?
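
For reference, here is roughly the cufftXt pattern I mean, condensed from the cuFFT documentation (a sketch, with error checking omitted and the input left uninitialized for brevity):

#include <stdlib.h>

#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>

int main () {
    int N = 8;
    cufftComplex *h_data = (cufftComplex*)malloc(sizeof(cufftComplex)*N*N);

    cufftHandle plan;
    cufftCreate(&plan);

    int whichGPUs[2] = {0, 1};
    cufftXtSetGPUs(plan, 2, whichGPUs);

    size_t worksize[2];
    cufftMakePlan2d(plan, N, N, CUFFT_C2C, worksize);

    // cufftXtMalloc chooses how the array is divided between the GPUs;
    // unlike the manual scheme above, the caller never picks the halves.
    cudaLibXtDesc *d_data;
    cufftXtMalloc(plan, &d_data, CUFFT_XT_FORMAT_INPLACE);
    cufftXtMemcpy(plan, d_data, h_data, CUFFT_COPY_HOST_TO_DEVICE);

    cufftXtExecDescriptorC2C(plan, d_data, d_data, CUFFT_FORWARD);

    cufftXtFree(d_data);
    cufftDestroy(plan);
    free(h_data);
    return 0;
}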

Upvotes: 0

Views: 454

Answers (1)

kyle moats

Reputation: 21

Here is how I did it, following the simpleCUFFT_2d_MGPU code sample from the CUDA Toolkit. I am not sure it is completely correct; it runs about 50% slower on two GPUs than the equivalent single-GPU version did. I tested this code (against another version using R2C and C2R FFTs) on Tesla K40 GPUs.

#include <iostream>
#define _USE_MATH_DEFINES
#include <math.h>
#include <ctime>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include <string>
#include <stdlib.h>
#include <stdio.h>

#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>

using namespace std;

// Cube the real part of each element in this GPU's slice of the descriptor.
// Each of the nGPUs slices holds only n/nGPUs elements, so guard on that
// count rather than on the full transform size n.
__global__ void Cube (cufftComplex *data, cufftComplex *data3, int N, int n, int nGPUs) {

    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n/nGPUs){
        data3[i].x = pow(data[i].x, 3.0f);
        data3[i].y = 0;
    }
}

// Undo the scaling from the forward+inverse round trip. The divisor is the
// total transform size n, but each GPU's slice holds only n/nGPUs elements.
__global__ void Normalize (cufftComplex *data, int N, int n, int nGPUs){

    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n/nGPUs){
        data[i].x /= n;
        data[i].y /= n;   // the imaginary part is scaled by the FFT as well
    }
}

int main (int argc, char **argv) {

    int x, y;
    int N = 8192;
    int n = N*N;

    int mem_size = sizeof(cufftComplex)*n;

    int maxThreads=(N>1024)?1024:N;
    int threadsPerBlock = maxThreads;
    // The grid size is computed per GPU below, once nGPUs is known.

    cufftComplex *h_data = (cufftComplex*)malloc(mem_size);
    cufftComplex *h_data3 = (cufftComplex*)malloc(mem_size);

cout <<"host data allocated" <<endl;

    // Initialize a cosine plane wave of wavelength lambda along x+y.
    int index;
    float lambda = N*.1;
    for(y=0; y<N; y++){
        for(x=0; x<N; x++){
            index = x + y*N;
            h_data[index].x = cos(2*M_PI*(x+y)/lambda);
            h_data[index].y = 0;
        }
    }

cout <<"host data values set" <<endl;

    cufftResult res;
    int  device;
    int nGPUs;
    cudaGetDeviceCount(&nGPUs);
    cout <<nGPUs <<" CUDA devices" <<endl;

    size_t total_mem, free_mem;
    for(int i=0; i<nGPUs; i++){
        cudaSetDevice(i);   // cudaMemGetInfo reports on the current device
        cudaMemGetInfo(&free_mem, &total_mem);
        cout <<"GPU" <<i <<" used memory " <<(total_mem-free_mem)/pow(10,9) <<" GB" <<endl;
    }

    // Use every visible device (a variable-length array here is not standard C++).
    int *whichGPUs = (int*)malloc(sizeof(int) * nGPUs);
    for(int i=0; i<nGPUs; i++){
        whichGPUs[i]=i;
    }

cout <<"whichgpus set" <<endl;

    size_t *worksize = (size_t*)malloc(sizeof(size_t) * nGPUs);

cout <<"worksize set" <<endl;

    cufftHandle plan_complex;
    res = cufftCreate(&plan_complex);
        if (res != CUFFT_SUCCESS){cout <<"create plan failed" <<endl;}
    res = cufftXtSetGPUs(plan_complex, nGPUs, whichGPUs);
        if (res != CUFFT_SUCCESS){cout <<"setgpus forward failed" <<endl;}
cout <<"set gpus" <<endl;
    res = cufftMakePlan2d(plan_complex, N, N, CUFFT_C2C, worksize);
        if (res != CUFFT_SUCCESS){cout <<"make plan forward failed" <<endl;}

cout <<"plan created" <<endl;

    cudaLibXtDesc *d_data; 
    cudaLibXtDesc *d_data3;

    res = cufftXtMalloc(plan_complex, (cudaLibXtDesc **)&d_data, CUFFT_XT_FORMAT_INPLACE);
        if (res != CUFFT_SUCCESS){cout <<"data malloc failed" <<endl;}

    res = cufftXtMalloc(plan_complex, (cudaLibXtDesc **)&d_data3, CUFFT_XT_FORMAT_INPLACE);
        if (res != CUFFT_SUCCESS){cout <<"data3 malloc failed" <<endl;}

cout <<"xtmalloc done" <<endl;

    res = cufftXtMemcpy (plan_complex, d_data, h_data, CUFFT_COPY_HOST_TO_DEVICE);
        if (res != CUFFT_SUCCESS){cout <<"memcpy to device failed" <<endl;}

cout <<"memcpy h to d" <<endl;

int tmax = 10000;
time_t start = time(0);

for(int tau=0; tau<tmax; tau++){

    res = cufftXtExecDescriptorC2C(plan_complex, d_data, d_data, CUFFT_FORWARD);
        if (res != CUFFT_SUCCESS){cout <<"cufftXtExec failed" <<endl; return 0;}

    res = cufftXtExecDescriptorC2C(plan_complex, d_data, d_data, CUFFT_INVERSE);
        if (res != CUFFT_SUCCESS){cout <<"cufftXtExec failed" <<endl; return 0;}

    // Each GPU's slice of the descriptor holds n/nGPUs elements, so size
    // the grid for one slice, not for the whole transform.
    int numBlocks = (n/nGPUs + threadsPerBlock - 1)/threadsPerBlock;
    for(int i=0; i<nGPUs; i++){
        device = d_data->descriptor->GPUs[i];
        cudaSetDevice(device);
        Normalize <<<numBlocks, threadsPerBlock>>> ((cufftComplex*) d_data->descriptor->data[i], N, n, nGPUs);
    }

    // Synchronize every GPU, not just the most recently selected one.
    for(int i=0; i<nGPUs; i++){
        cudaSetDevice(d_data->descriptor->GPUs[i]);
        cudaDeviceSynchronize();
    }

}

time_t stop = time(0);
cout <<tmax <<" timesteps" <<endl <<(stop-start) <<" seconds" <<endl;

/*
    // Same per-slice grid as the Normalize launch above.
    int numBlocks = (n/nGPUs + threadsPerBlock - 1)/threadsPerBlock;
    for(int i=0; i<nGPUs; i++){
        device = d_data->descriptor->GPUs[i];
        cudaSetDevice(device);
        Cube <<<numBlocks, threadsPerBlock>>> ((cufftComplex*) d_data->descriptor->data[i], (cufftComplex*) d_data3->descriptor->data[i], N, n, nGPUs);
    }
*/

/*
    cudaDeviceSynchronize();

    res = cufftXtMemcpy (plan_complex, h_data, d_data, CUFFT_COPY_DEVICE_TO_HOST);
        if (res != CUFFT_SUCCESS){cout <<"memcpy to host failed" <<endl;}

cout <<"memcpy d to h" <<endl;

    ofstream fout;
    ostringstream outstr;
    outstr.precision(4);
    outstr <<time(0) <<".dat";
    string filename=outstr.str();
    fout.open(filename.c_str());
    fout.precision(4);
    for (int i = 0; i < n; i++) {
        x = (i % (N));
            y = (i /(N))%N; 
            fout <<x <<" " <<y <<" " <<h_data[i].x <<endl;
    }
    fout.close();  
*/

    //clean up
    res = cufftXtFree(d_data);
        if (res != CUFFT_SUCCESS){cout <<"free data failed" <<endl;}
    res = cufftXtFree(d_data3);
        if (res != CUFFT_SUCCESS){cout <<"free data3 failed" <<endl;}
    cufftDestroy(plan_complex);
    free(h_data);
    free(h_data3);
    free(worksize);
    free(whichGPUs);

    return 0;
}
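
The part that actually answers my question is the Normalize launch loop: a cudaLibXtDesc exposes its per-GPU allocations through d_data->descriptor->GPUs[i] and d_data->descriptor->data[i], so ordinary kernels can be launched directly on the memory cufftXtMalloc allocated, with no second allocation. As for the speed, I suspect the slowdown is at least partly inherent to the problem: a multi-GPU 2D FFT has to exchange data between the GPUs on every execution, so a transform that fits comfortably on one GPU may simply run faster there.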

Upvotes: 1
