Reputation: 303
I am trying to learn CUDA by writing basic code, which should hopefully put me in a better position to convert my existing C++ code to CUDA (for research).
I need to do a fair bit of complex number manipulation, so I have written this very basic code to multiply an array of complex numbers by a real number in a GPU kernel.
#include <complex>
#include <iostream>
#include <cmath>
#include "cuda.h"
#include "math.h"
#include "cuComplex.h"
#define n 5
using namespace std;
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
__global__ void func( double *s, cuDoubleComplex *j, cuDoubleComplex *calc ) {
    int tid = blockIdx.x;
    calc[tid] = cuCmul(j[tid], make_cuDoubleComplex(*s, 0));
}
int main( void ) {
    cuDoubleComplex calc[n+1], *dev_j, *dev_calc;
    double *dev_s, s[n+1] = { 2.0, 2.0, 2.0, 2.0, 2.0 };
    //complex<double> j[n+1]
    cuDoubleComplex j[n+1];

    for (int i = 1; i <= n; i++) {
        j[i] = make_cuDoubleComplex(0, 5);
        cout << "\nJ cout = " << cuCreal(j[i]) << ", " << cuCimag(j[i]);
    }

    // allocate the memory on the GPU
    cudaMalloc( (void**)&dev_s, (n+1) * sizeof(double) );
    cudaMalloc( (void**)&dev_j, (n+1) * sizeof(double) );
    cudaMalloc( (void**)&dev_calc, (n+1) * sizeof(double) );

    cudaMemcpy( dev_s, s, (n+1) * sizeof(double), cudaMemcpyHostToDevice );
    cudaMemcpy( dev_j, j, (n+1) * sizeof(double), cudaMemcpyHostToDevice );

    func<<<n,1>>>( dev_s, dev_j, dev_calc );
    //kernel<<<1,1>>>(a_d);

    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaMemcpy(calc, dev_calc, (n+1) * sizeof(double), cudaMemcpyDeviceToHost) );
    //cudaMemcpy( calc, dev_calc, (n+1) * sizeof(double), cudaMemcpyDeviceToHost );

    for (int i = 1; i <= n; i++) {
        cout << "\nCALC cout = " << cuCreal(calc[i]) << ", " << cuCimag(calc[i]);
    }

    return 0;
}
The final answer is wrong, and I have also identified a few other places where I am not getting expected values.
1) I expected a complex double array of (0, 5i) for all elements of 'j' after the following line of code. However, I am getting all 0s. Why is that?
j[i] = make_cuDoubleComplex(0, 5);
2) Why can't I print my array using cout? The line of code shown below gives the following error: no operator "<<" matches these operands. How can I fix this without using printf?
cout << "\nJ = " << j[i];
3) The GPU function 'func', which should give out an array of (0, 10i) as the final answer, is giving random values such as these:
CALC = -1.#QNAN0
CALC = -1.#QNAN0
CALC = -9255963134931783100000000...000.. etc
CALC = -9255963134931783100000000...000.. etc
4) For my actual research, the complex array 'j' will be given in the format of complex<double> and not cuDoubleComplex. Can I do similar operations on a 'j' array of complex<double>s using the function 'func'? If not, what are my options?
I think I've explained myself well, but feel free to ask any follow-up questions. I'm new to C++ as well as CUDA, so be nice :D
Upvotes: 1
Views: 9871
Reputation: 151869
When writing CUDA code, especially when you're learning or having difficulty (things aren't working the way you expect), you should always do CUDA error checking on all CUDA API calls and kernel calls.
I don't think there are actually any CUDA functional errors in your code (good job!), but it's worth pointing out.
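As a rough sketch of what that looks like (reusing the gpuErrchk macro and the variable names from your own main, so treat this as illustrative rather than complete), you would wrap every API call and also check the kernel launch:
    // Check the return status of every CUDA API call with the existing macro
    gpuErrchk( cudaMalloc( (void**)&dev_s, (n+1) * sizeof(double) ) );
    gpuErrchk( cudaMemcpy( dev_s, s, (n+1) * sizeof(double), cudaMemcpyHostToDevice ) );

    // Kernel launches don't return an error code directly, so check afterwards
    func<<<n,1>>>( dev_s, dev_j, dev_calc );
    gpuErrchk( cudaPeekAtLastError() );     // launch/configuration errors
    gpuErrchk( cudaDeviceSynchronize() );   // errors raised while the kernel runs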
Most of your questions are due to the fact that you're not printing out the type cuDoubleComplex properly. Your printf statements are specifying a float format parameter (%f) but you are not passing a float value (you're passing a cuDoubleComplex value). That won't work, and printf will behave strangely when you do that, without giving any error indication.
Instead, try something like this:
printf("\nJ = %f, %f", cuCreal(j[i]), cuCimag(j[i]));
These functions (cuCreal and cuCimag) return the real and imaginary parts of cuComplex numbers, and they return them as an appropriate type, float or double, and in this case the implicit cast from double to float is OK for what you're doing and can be handled by printf (although it's not really good programming practice -- instead use the correct printf format specifier for a double value).
If you make that change for both of your printf statements, I think you'll get expected results -- at least I did when I ran your code. If you still get garbage, then your CUDA GPU may not be working correctly, and here is where doing that CUDA error checking I mentioned will help you discover what the problem is.
Regarding your questions concerning cout, the answer is roughly equivalent to my explanation for what is going on with printf. cout doesn't understand the type cuDoubleComplex and so throws an error. If you want to fix it without using printf, convert your cuDoubleComplex values to their individual real and imaginary parts, represented by float or double, using the conversion functions I've indicated in the printf statement above.
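One way to make cout work directly (my illustration, not part of the original answer) is to add a stream inserter for cuDoubleComplex that does exactly that decomposition; a minimal sketch:
    #include <iostream>
    #include "cuComplex.h"

    // Host-side stream inserter built on the cuCreal/cuCimag accessors,
    // so that 'cout << j[i]' prints both the real and imaginary parts.
    std::ostream& operator<<(std::ostream& os, const cuDoubleComplex& z) {
        return os << cuCreal(z) << " + " << cuCimag(z) << "i";
    }
With that in place, a line like cout << "\nJ = " << j[i]; compiles and prints both parts.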
Regarding your last question, it should not be difficult to convert your complex data to cuDoubleComplex type. Write a conversion function to do it based on the utilities you have in cuComplex.h. There are back-door ways around this, but they're not good programming practice.
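As an illustration (the helper name here is mine, not something provided by cuComplex.h), such a conversion function can be as simple as:
    #include <complex>
    #include "cuComplex.h"

    // Convert a std::complex<double> to cuDoubleComplex via make_cuDoubleComplex
    cuDoubleComplex toCuDoubleComplex(const std::complex<double>& z) {
        return make_cuDoubleComplex(z.real(), z.imag());
    }
You would convert your host array element by element into a cuDoubleComplex buffer before the cudaMemcpy to the device.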
EDIT: In response to follow up questions, there were two more errors in the code currently posted. dev_j and dev_calc are of type cuDoubleComplex, but you were doing cudaMalloc and cudaMemcpy on these quantities as if they were of size double. In the following code, I changed those sizeof(double) entries to sizeof(cuDoubleComplex).
Here's a modification of your code that works for me:
//#include <complex> // not necessary for this code
#include <iostream>
#include <cmath>
//#include "cuda.h" // not necessary when compiling with nvcc
#include "math.h"
#include "cuComplex.h"
#define n 5
using namespace std;
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
__global__ void func( double *s, cuDoubleComplex *j, cuDoubleComplex *calc ) {
    int tid = blockIdx.x;
    calc[tid] = cuCmul(j[tid], make_cuDoubleComplex(*s, 0));
}
int main( void ) {
    cuDoubleComplex calc[n+1], *dev_j, *dev_calc;
    double *dev_s, s[n] = { 2.0, 2.0, 2.0, 2.0, 2.0 };
    //complex<double> j[n+1]
    cuDoubleComplex j[n];

    for (int i = 0; i < n; i++) {
        j[i] = make_cuDoubleComplex(0, 5);
        cout << "\nJ cout = " << cuCreal(j[i]) << ", " << cuCimag(j[i]);
    }

    // allocate the memory on the GPU
    cudaMalloc( (void**)&dev_s, (n) * sizeof(double) );
    cudaMalloc( (void**)&dev_j, (n) * sizeof(cuDoubleComplex) );
    cudaMalloc( (void**)&dev_calc, (n) * sizeof(cuDoubleComplex) );

    cudaMemcpy( dev_s, s, (n) * sizeof(double), cudaMemcpyHostToDevice );
    cudaMemcpy( dev_j, j, (n) * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice );

    func<<<n,1>>>( dev_s, dev_j, dev_calc );
    //kernel<<<1,1>>>(a_d);

    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaMemcpy(calc, dev_calc, (n) * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost) );
    //cudaMemcpy( calc, dev_calc, (n+1) * sizeof(double), cudaMemcpyDeviceToHost );

    for (int i = 0; i < n; i++) {
        cout << "\nCALC cout = " << cuCreal(calc[i]) << ", " << cuCimag(calc[i]);
    }

    return 0;
}
Upvotes: 1