Reputation: 182
I am new to CUDA and trying to get a grasp of the basics, so I apologize if something I ask or say sounds overly simple. I wrote some serial code in C that generates an array of random numbers and then finds the maximum value within this array.
#include <stdio.h>
#include <stdlib.h> /* srand, rand */
#include <time.h>   /* time */

#define num 100000

int *arr, max = -1;

int getRand() {
    double r1 = rand() / (double)RAND_MAX; // Generates value between 0 & 1
    return (r1 * num) + 1;
}

void generateRandom(int M) {
    int i;
    for (i = 0; i < M; i++) {
        arr[i] = getRand();
    }
}

void getMax(int M) {
    int i;
    for (i = 0; i < M; i++) {
        if (arr[i] > max)
            max = arr[i];
    }
}

int main(int argc, char *argv[]) {
    if (argc == 2) {
        int M;
        /* initialize random seed: */
        srand(time(NULL));
        M = atoi(argv[1]);
        //int arr[M];
        arr = (int*)calloc(M, sizeof(int));
        //printf("M = %d MAX = %d\n", M, RAND_MAX);
        generateRandom(M);
        getMax(M);
        printf("Max value: %d", max);
    }
    else
        printf("Invalid arguments.");
    return 0;
}
I am now trying to convert this code into a simple CUDA program. I tried making the getMax function run as a kernel, but I ran into problems with the memory management.
#include <stdio.h>
#include <stdlib.h> /* srand, rand */
#include <time.h>   /* time */
#include <cuda.h>

#define num 100000

int *arr, max = -1;

int getRand() {
    double r1 = rand() / (double)RAND_MAX; // Generates value between 0 & 1
    return (r1 * num) + 1;
}

void generateRandom(int M) {
    int i;
    for (i = 0; i < M; i++) {
        arr[i] = getRand();
    }
}

__global__ void getMax(int M) {
    int i;
    for (i = 0; i < M; i++) {
        if (arr[i] > max)
            max = arr[i];
    }
}

int main(int argc, char *argv[]) {
    if (argc == 2) {
        int M;
        /* initialize random seed: */
        srand(time(NULL));
        M = atoi(argv[1]);
        //int arr[M];
        arr = (int*)calloc(M, sizeof(int));
        //printf("M = %d MAX = %d\n", M, RAND_MAX);
        generateRandom(M);
        getMax<<<1,1>>>(M);
        printf("Max value: %d", max);
    }
    else
        printf("Invalid arguments.");
    return 0;
}
Compiling that code resulted in the following warnings.
cudabasic.cu(23): warning: a host variable "arr" cannot be directly read in a device function
cudabasic.cu(23): warning: a host variable "max" cannot be directly read in a device function
cudabasic.cu(24): warning: a host variable "arr" cannot be directly read in a device function
cudabasic.cu(24): warning: a host variable "max" cannot be directly written in a device function
I googled the warnings and found that the problem was that I was using host global variables inside a kernel, which the device can't access. Following an online suggestion, I tried to solve this by passing pointers rather than the actual variables, but I am still getting errors.
#include <stdio.h>
#include <stdlib.h> /* srand, rand */
#include <time.h>   /* time */
#include <cuda.h>

#define num 100000

int *arr, max = -1;

int getRand() {
    double r1 = rand() / (double)RAND_MAX; // Generates value between 0 & 1
    return (r1 * num) + 1;
}

void generateRandom(int M) {
    int i;
    for (i = 0; i < M; i++) {
        arr[i] = getRand();
    }
}

__global__ void getMax(int M, int *dArr, int *dMax) {
    int i = threadIdx.x;
    int a = dArr[i];
    for (i = 0; i < M; i++) {
        if (a > dMax)
            dMax = a;
    }
}

int main(int argc, char *argv[]) {
    if (argc == 2) {
        int M;
        /* initialize random seed: */
        srand(time(NULL));
        M = atoi(argv[1]);
        //int arr[M];
        arr = (int*)calloc(M, sizeof(int));
        devArr = (int*)cudaMalloc(M, sizeof(int));
        //printf("M = %d MAX = %d\n", M, RAND_MAX);
        generateRandom(M);
        getMax<<<1,1>>>(M, arr, max);
        printf("Max value: %d", max);
    }
    else
        printf("Invalid arguments.");
    return 0;
}
cudabasic.cu(24): error: operand types are incompatible ("int" and "int *")
cudabasic.cu(25): error: a value of type "int" cannot be assigned to an entity of type "int *"
Can someone point me in the right direction on how best to go about doing this?
Upvotes: 2
Views: 1627
Reputation: 151889
The best advice I can offer is to study some introductory CUDA programming material such as this. Your code displays a lack of understanding not only of CUDA, but of basic C concepts (for example, that variables must be defined before they are used in expressions). As a CUDA programmer, don't "flush" your knowledge of how to write proper C or C++ code. If you google things like "gtc cuda intro" or "gtc cuda optimization" you'll find good CUDA learning material.
The approach you're following, which is to take single-threaded C/C++ code and convert it to run using a single CUDA thread, might give you some warm and fuzzy feelings about "learning CUDA", but you're not really tackling any of the important concepts - and it shows in the code you're struggling with right now.
To get the last code you presented functional, a few more steps are needed:
1. In CUDA, device pointers can't normally be dereferenced in host code, and host pointers can't normally be used in device code. This means that you generally shouldn't pass host pointers to a device kernel:
getMax<<<1,1>>>(M, arr, max);
                   ^^^  ^^^
You were on the way to fixing the arr issue with your devArr (although your cudaMalloc call is not set up properly); we just need to fix it and complete it with an additional cudaMemcpy operation to copy the host data to the device. If you're not sure how to use a function like cudaMalloc, don't just guess your way through it and use casts to force types into other types - that's often a sign that you're not handling it correctly:
devArr = (int*)cudaMalloc(M,sizeof(int));
Instead, refer to the documentation. We also need to handle max correctly - it is currently a host variable, and we will need a device copy of that data.
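For comparison, here is the documented pattern, as used in the complete code below: pass the address of the device pointer and a size in bytes, and don't cast the return value (cudaMalloc returns a cudaError_t status, not a pointer):

int *devArr;
cudaMalloc(&devArr, M*sizeof(int));                              // allocate room for M ints on the device
cudaMemcpy(devArr, arr, M*sizeof(int), cudaMemcpyHostToDevice);  // copy the host data into it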
2. Your kernel is also a bit jumbled. Since you are launching only a single CUDA thread, your threadIdx.x variable will only (ever) be zero:
int i = threadIdx.x;
int a = dArr[i];
But the for-loop in the kernel will work; we just need to move some lines around.
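Concretely, the load belongs inside the loop body, and dMax is a pointer that must be dereferenced, as in the complete code below:

for (int i = 0; i < M; i++) {
    int a = dArr[i];   // read the current element inside the loop
    if (a > *dMax)     // dereference dMax - it is a pointer
        *dMax = a;
}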
3. Although you haven't yet gotten to the point of compilable, runnable code, it's always a good idea to do proper CUDA error checking. I've added my own version to the code below.
The following code has the above issues addressed, and seems to return a sane result:
#include <stdio.h>
#include <stdlib.h> /* srand, rand */
#include <time.h>   /* time */
#include <cuda.h>

#define num 100000

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

int *arr, my_max = -1;

int getRand() {
    double r1 = rand() / (double)RAND_MAX; // Generates value between 0 & 1
    return (r1 * num) + 1;
}

void generateRandom(int M) {
    int i;
    for (i = 0; i < M; i++) {
        arr[i] = getRand();
    }
}

__global__ void getMax(int M, int *dArr, int *dMax) {
    for (int i = 0; i < M; i++) {
        int a = dArr[i];
        if (a > *dMax)
            *dMax = a;
    }
}

int main(int argc, char *argv[]) {
    if (argc == 2) {
        int M;
        int *devArr, *devMax;
        /* initialize random seed: */
        srand(time(NULL));
        M = atoi(argv[1]);
        //int arr[M];
        arr = (int*)calloc(M, sizeof(int));
        cudaMalloc(&devArr, M*sizeof(int));
        cudaCheckErrors("cudaMalloc 1 fail");
        cudaMalloc(&devMax, sizeof(int));
        cudaCheckErrors("cudaMalloc 2 fail");
        cudaMemset(devMax, 0, sizeof(int));
        cudaCheckErrors("cudaMemset fail");
        //printf("M = %d MAX = %d\n", M, RAND_MAX);
        generateRandom(M);
        cudaMemcpy(devArr, arr, M*sizeof(int), cudaMemcpyHostToDevice);
        cudaCheckErrors("cudaMemcpy 1 fail");
        getMax<<<1,1>>>(M, devArr, devMax);
        cudaMemcpy(&my_max, devMax, sizeof(int), cudaMemcpyDeviceToHost);
        cudaCheckErrors("cudaMemcpy 2/kernel fail");
        printf("Max value: %d \n", my_max);
    }
    else
        printf("Invalid arguments.");
    return 0;
}
After you've understood the above changes, you'll want to go back to my original advice and get some organized CUDA learning. At that point, if you want to re-visit max-finding, then the "good" way to do it is with a proper parallel reduction technique. A "reduction" is an algorithm that takes a (large) data set and returns a single number or a small set of numbers as a result. Finding the max in an array is an example of a "reduction". You can learn more about proper CUDA parallel reductions by studying this and working through the CUDA parallel reduction sample code.
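To give a flavor of what that looks like, here is a minimal, untuned sketch (my own, not the linked sample code) of a block-level max reduction. It assumes a power-of-two block size and reuses the devArr/devMax names from the code above:

#include <limits.h>

__global__ void maxReduce(const int *dArr, int N, int *dMax) {
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    // each thread loads one element (or INT_MIN past the end of the array)
    sdata[tid] = (i < N) ? dArr[i] : INT_MIN;
    __syncthreads();
    // tree reduction in shared memory: halve the number of active threads each step
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s)
            sdata[tid] = max(sdata[tid], sdata[tid + s]);
        __syncthreads();
    }
    // thread 0 folds this block's result into the global maximum
    if (tid == 0)
        atomicMax(dMax, sdata[0]);
}

// example launch: one thread per element, 256 threads per block,
// with *devMax initialized to INT_MIN (or -1, for this data) beforehand:
//   maxReduce<<<(M + 255) / 256, 256, 256 * sizeof(int)>>>(devArr, M, devMax);

Every thread in a block participates in the __syncthreads() calls, and each pass halves the number of threads doing comparisons, so a block of 256 threads finds its local max in 8 steps rather than 255 sequential ones.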
Upvotes: 7