Reputation: 79
I recently setup Ubuntu 14.04.1 LTS. I am compiling a small matrix addition program I wrote, matrixAddition.cu. I compile the code like so: nvcc matrixAddition.cu -o matAdd. The produced matrix should show 7 7 7 ... as I am adding an array of 5 with an array of 2. However, it prints 5 5 5 ... for me. Is the kernel failing to work for some reason? Am I missing some code? Thanks for reading.
matrixAddition.cu
#include <iostream>
#include <stdio.h>
__global__ void matAdd(int * d_arra, int * d_arrb, int * length)
{
int id = threadIdx.x;
if(id<*length)
{
d_arra[id]=d_arra[id]+d_arrb[id];
}
}
int main () {
//cpu varriables
int arra[100];
int arrb[100];
int leng = 100;
//gpu varriables
int * d_arra;
int * d_arrb;
int * length;
//-std=c++11
for(int itr=0;itr<100;itr++){arra[itr]=5;arrb[itr]=2;}
cudaMalloc( (void**)&d_arra, 100*sizeof(int));
cudaMalloc( (void**)&d_arrb, 100*sizeof(int));
cudaMemcpy( d_arra, arra, 100*sizeof(int), cudaMemcpyHostToDevice );
cudaMemcpy( d_arrb, arrb, 100*sizeof(int), cudaMemcpyHostToDevice );
cudaMemcpy( length, &leng, sizeof(int), cudaMemcpyHostToDevice );
//max thread per block 1024
matAdd<<<1, 1024>>>(d_arra, d_arrb, length);
cudaMemcpy( arra, d_arra, 100*sizeof(int), cudaMemcpyDeviceToHost );
cudaFree( d_arra );
cudaFree( d_arrb );
cudaFree( length );
std::cout << " our data \n";
for(int itr=0;itr<100;itr++){std::cout << arra[itr]; if(itr%10==0&&itr!=0){std::cout <<"\n";}}
std::cout<<std::endl;
return 0;
}
Upvotes: 1
Views: 288
Reputation: 1157
The bug is common in C. Variable "length" is not initialized. You forgot to malloc memory space for a point variable before copying data to the address.
If using type "int *", you have to do cudaMalloc for it as d_arra/d_arrb. But it's best to use type "int" instead of "int *" as the example vectorAdd in CUDA Samples, since only a single int is passed to your CUDA device.
Another small issue is in your cout part. Since array index starts from 0, you need output a line break when "(itr+1)%10==0 && itr!=0".
//cpu varriables
int arra[100];
int arrb[100];
int leng = 100;
//gpu varriables
int * d_arra;
int * d_arrb;
int * length;
//-std=c++11
for (int itr=0; itr<100; itr++)
{
arra[itr]=5;
arrb[itr]=2;
}
cudaMalloc( (void**)&d_arra, 100*sizeof(int));
cudaMalloc( (void**)&d_arrb, 100*sizeof(int));
cudaMalloc( (void**)&length, 1*sizeof(int)); // Add this line
cudaMemcpy( d_arra, arra, 100*sizeof(int), cudaMemcpyHostToDevice );
cudaMemcpy( d_arrb, arrb, 100*sizeof(int), cudaMemcpyHostToDevice );
cudaMemcpy( length, &leng, sizeof(int), cudaMemcpyHostToDevice );
//max thread per block 1024
matAdd<<<1, 1024>>>(d_arra, d_arrb, length);
cudaMemcpy( arra, d_arra, 100*sizeof(int), cudaMemcpyDeviceToHost );
cudaFree( d_arra );
cudaFree( d_arrb );
cudaFree( length );
std::cout << " our data \n";
for (int itr=0; itr<100; itr++)
{
std::cout << arra[itr];
if((itr+1)%10==0 && itr!=0)
{
std::cout <<"\n";
}
}
std::cout<<std::endl;
Upvotes: 1