Reputation: 63
I am trying to calculate the average of 256 sets of data, each 8192 bytes long. I have a kernel that works for 216 data sets, but with any more than that the kernel returns 0 for each average. I am using a very basic reduction scheme to calculate the average.
Graphics Card: GTX 780 Ti
Here is my code
// One step of a pairwise tree reduction over `Sets` independent rows of
// `Length` doubles stored back-to-back in `Input` (row s starts at s*Length).
//
// At step N (a power of two, 2 <= N <= Length) each row still holds
// i = Length/N live partial sums in its first i slots; every active thread
// folds element (Offset + i) into element Offset, halving the live region.
// The host launches this kernel once per N = 2, 4, ..., Length; on the final
// step (N == Length) each row's total sits in its first element and the
// per-set average is written to Average[set].
//
// Expected launch: 1D grid with at least i*Sets threads for the fold and at
// least Sets threads for the final step.
__global__ void Average(double *Input, int Length, int Sets, double *Average, int N) {
    unsigned int Pos = (blockDim.x * blockIdx.x) + threadIdx.x;
    int i = Length / N;  // live elements remaining per set at this step
    if (Pos < (unsigned int)(i * Sets)) {
        // Map flat index -> (set = Pos / i, element = Pos % i) inside that set's row.
        unsigned int Offset = ((Pos / i) * Length) + (Pos % i);
        Input[Offset] += Input[Offset + i];
    }
    // NOTE: __syncthreads() only orders threads within one block; ordering
    // across blocks is provided by the separate per-step kernel launches.
    __syncthreads();
    if (N == Length && Pos < (unsigned int)Sets) {
        // Final step: i == 1, so this same thread just produced the row sum
        // at Input[Pos*Length]. Bounds guard added so threads beyond `Sets`
        // can never write past the end of Average.
        Average[Pos] = Input[Pos * Length] / Length;
    }
}
using namespace std;
// Host driver: averages `Sets` rows of `Length` doubles on the GPU with a
// log2(Length)-step tree reduction, one kernel launch per step.
int main()
{
    const int Length = 8192;            // elements per set
    const int Sets = 256;               // number of data sets
    const int Width = Length * Sets;    // total element count

    // Fix (per the accepted answer): allocate exactly Sets / Width doubles.
    // The original code multiplied by an undefined "CameraWidth" factor,
    // grossly over-allocating and making cudaMalloc fail silently.
    double *GPU_Average = NULL, *GPU_Data = NULL;
    if (cudaMalloc((void**)&GPU_Average, sizeof(double) * Sets) != cudaSuccess) {
        return 1;
    }
    if (cudaMalloc((void**)&GPU_Data, sizeof(double) * Width) != cudaSuccess) {
        cudaFree(GPU_Average);
        return 1;
    }

    // Heap-allocate host buffers: Width doubles is 16 MB, far too large for
    // the stack (the original stack arrays risked a silent stack overflow).
    double *CPU_Data = new double[Width];
    double *CPU_Average = new double[Sets];
    for (int i = 0; i < Width; i++) {
        CPU_Data[i] = i;
    }
    cudaMemcpy(GPU_Data, CPU_Data, sizeof(double) * Width, cudaMemcpyHostToDevice);

    // One launch per reduction step; cross-block ordering between steps is
    // guaranteed by consecutive launches on the same (default) stream.
    for (int N = 2; N <= Length; N *= 2) {
        int Total = (Sets * Length) / N;             // threads needed this step
        int Threads = (Total > 1024) ? 1024 : Total; // cap at max block size
        int Blocks = (Total + Threads - 1) / Threads; // ceil-div (exact here: powers of two)
        Average<<<Blocks, Threads>>>(GPU_Data, Length, Sets, GPU_Average, N);
    }
    // Surface any launch-configuration error before copying results back.
    cudaError_t err = cudaGetLastError();

    // Blocking cudaMemcpy also synchronizes with the last kernel launch.
    cudaMemcpy(CPU_Average, GPU_Average, sizeof(double) * Sets, cudaMemcpyDeviceToHost);

    cudaFree(GPU_Data);
    cudaFree(GPU_Average);
    delete[] CPU_Data;
    delete[] CPU_Average;
    return (err == cudaSuccess) ? 0 : 1;
}
Any help on this matter is appreciated.
Upvotes: 0
Views: 59
Reputation: 63
I had not realised that in my actual code (not the one above) I had written
cudaMalloc((void**)&GPU_Data, Width*sizeof(double)*Width);
instead of
cudaMalloc((void**)&GPU_Data, sizeof(double)*Width);
this was allocating too much memory and causing errors.
Upvotes: 1