Reputation: 177
I have this code and it crashes on roughly every first or second launch. I have tried allocating with malloc/cudaMallocHost/cudaMalloc, but it did not help. I think it happens because of the manual cufftComplex initialization, but I can't prove it, because without input data I can't compute the FFT. Could you help me eliminate these crashes?
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <fstream>
#include <conio.h>
#include <cuda.h>
#include <cufft.h>
using namespace std;
// Question's original program: fills 8 complex samples, runs an in-place
// forward C2C FFT on the GPU with cuFFT, and prints input and output.
// NOTE(review): this listing is the failing code being asked about; the
// defects are flagged below and intentionally left in place.
int main(int argc, char **argv)
{
cufftHandle plan;
// data   - device buffer that cuFFT transforms in place
// digits - host input samples
// h_data - intended as the host copy of the result (see BUG note below)
cufftComplex *data;
cufftComplex *digits;
cufftComplex *h_data;
// Page-locked (pinned) HOST memory; return value is not checked.
cudaMallocHost((void**)&digits, sizeof(cufftComplex)*8);
// Real-valued input: imaginary parts are all zero.
digits[0].x = 12.5f; digits[0].y = 0.0f;
digits[1].x = 66.23f; digits[1].y = 0.0f;
digits[2].x = 35.1f; digits[2].y = 0.0f;
digits[3].x = 16.7f; digits[3].y = 0.0f;
digits[4].x = 14.83f; digits[4].y = 0.0f;
digits[5].x = 55.1f; digits[5].y = 0.0f;
digits[6].x = 11.7f; digits[6].y = 0.0f;
digits[7].x = 18.83f; digits[7].y = 0.0f;
// DEVICE allocation and host-to-device upload; neither return value is checked.
cudaMalloc((void**)&data, sizeof(cufftComplex)*8);
cudaMemcpy(data, digits, sizeof(cufftComplex)*8, cudaMemcpyHostToDevice);
// 1D plan: 8 points, complex-to-complex, batch of 1.
// NOTE(review): the early returns below exit without freeing digits/data
// (and, after this point, without destroying the plan).
if (cufftPlan1d(&plan, 8, CUFFT_C2C, 1) != CUFFT_SUCCESS) {
fprintf(stderr, "Cuda: cufftPlan1d CUFFT_C2C failed\n");
return 1;
}
// In-place forward transform: input and output are both `data`.
if (cufftExecC2C(plan, data, data, CUFFT_FORWARD) != CUFFT_SUCCESS) {
fprintf(stderr, "Cuda: cufftExecC2C CUFFT_FORWARD failed\n");
return 1;
}
// BUG: cudaMalloc returns a DEVICE pointer, but h_data is used as a host
// buffer: it is the destination of a device-to-host copy and is dereferenced
// by the CPU in the second print loop. It should be host memory
// (malloc/new, or cudaMallocHost).
if (cudaMalloc((void**)&h_data, sizeof(cufftComplex)*8) != cudaSuccess) {
fprintf(stderr, "Cuda: cudaMalloc((void**)&h_data failed\n");
return 1;
}
// Declared as device-to-host although h_data is a device pointer; the
// return value is not checked.
cudaMemcpy(h_data, data, sizeof(cufftComplex)*8, cudaMemcpyDeviceToHost);
printf("\nOriginal:\n");
for(int i = 0; i < 8; ++i){
printf("\nRe:%2.5f Im:%2.5f", digits[i].x, digits[i].y);
}
printf("\n\n1D-FFT:\n");
for(int i = 0; i < 8; ++i){
// Host code dereferencing the device pointer h_data: invalid on the CPU.
printf("\nRe:%2.5f Im:%2.5f", h_data[i].x, h_data[i].y);
}
// BUG: digits was allocated with cudaMallocHost, so it must be released
// with cudaFreeHost rather than cudaFree.
cudaFree(digits);
cudaFree(data);
cudaFree(h_data);
cufftDestroy(plan);
}
Upvotes: 0
Views: 1400
Reputation: 1932
instead of:
if (cudaMalloc((void**)&h_data, sizeof(cufftComplex)*8) != cudaSuccess) {
fprintf(stderr, "Cuda: cudaMalloc((void**)&h_data failed\n");
return 1;
}
try:
if (cudaMallocHost((void**)&h_data, sizeof(cufftComplex)*8) != cudaSuccess) {
fprintf(stderr, "Cuda: cudaMalloc((void**)&h_data failed\n");
return 1;
}
since you're copying bytes back to the cpu.
The documentation provides hints on cudaMallocHost
vs malloc
usage:
cudaError_t cudaMallocHost ( void ** ptr, size_t size )
Allocates size bytes of host memory that is page-locked and accessible to the device. The driver tracks the virtual memory ranges allocated with this function and automatically accelerates calls to functions such as cudaMemcpy*(). Since the memory can be accessed directly by the device, it can be read or written with much higher bandwidth than pageable memory obtained with functions such as malloc(). Allocating excessive amounts of memory with cudaMallocHost() may degrade system performance, since it reduces the amount of memory available to the system for paging. As a result, this function is best used sparingly to allocate staging areas for data exchange between host and device.
Upvotes: 1
Reputation: 72339
Your basic problem is improper mixing of host and device memory pointers. You have assigned the address of a device memory allocation (using cudaMalloc) to h_data
, but are trying to use it as a pointer to an address in host memory. That won't work and is producing the host segmentation fault you are seeing. Your example should look something like:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cufft.h>
/*
 * Computes an 8-point forward complex-to-complex FFT with cuFFT and prints
 * the input samples followed by the transformed values.
 *
 * Pointer roles (never mix them):
 *   digits, h_data - host buffers (malloc/free); safe to dereference on the CPU
 *   data           - device buffer (cudaMalloc/cudaFree); only passed to
 *                    CUDA/cuFFT calls, never dereferenced by host code
 *
 * Returns 0 on success and 1 if any allocation, copy, or cuFFT call fails.
 * Every exit path goes through `cleanup`, so nothing acquired is leaked.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    /* All locals are declared before the first `goto` so that no jump
     * crosses an initialization (required in C++). */
    const int N = 8;
    const size_t bytes = sizeof(cufftComplex) * N;
    const float samples[N] = {
        12.5f, 66.23f, 35.1f, 16.7f, 14.83f, 55.1f, 11.7f, 18.83f
    };

    cufftHandle plan = 0;
    bool planCreated = false;
    cufftComplex *data = NULL;    /* device */
    cufftComplex *digits = NULL;  /* host: input  */
    cufftComplex *h_data = NULL;  /* host: output */
    cudaError_t err = cudaSuccess;
    int status = 1;               /* pessimistic until everything succeeded */

    digits = (cufftComplex *)malloc(bytes);
    h_data = (cufftComplex *)malloc(bytes);
    if (digits == NULL || h_data == NULL) {
        fprintf(stderr, "Host: malloc failed\n");
        goto cleanup;
    }

    /* Real-valued input signal: imaginary parts are zero. */
    for (int i = 0; i < N; ++i) {
        digits[i].x = samples[i];
        digits[i].y = 0.0f;
    }

    err = cudaMalloc((void **)&data, bytes);
    if (err != cudaSuccess) {
        fprintf(stderr, "Cuda: cudaMalloc failed: %s\n", cudaGetErrorString(err));
        data = NULL;
        goto cleanup;
    }

    err = cudaMemcpy(data, digits, bytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "Cuda: cudaMemcpy host-to-device failed: %s\n",
                cudaGetErrorString(err));
        goto cleanup;
    }

    if (cufftPlan1d(&plan, N, CUFFT_C2C, 1) != CUFFT_SUCCESS) {
        fprintf(stderr, "Cuda: cufftPlan1d CUFFT_C2C failed\n");
        goto cleanup;
    }
    planCreated = true;

    /* In-place transform: `data` is both input and output. */
    if (cufftExecC2C(plan, data, data, CUFFT_FORWARD) != CUFFT_SUCCESS) {
        fprintf(stderr, "Cuda: cufftExecC2C CUFFT_FORWARD failed\n");
        goto cleanup;
    }

    /* Blocking copy back into HOST memory; it also surfaces any error
     * raised while the FFT was executing on the device. */
    err = cudaMemcpy(h_data, data, bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "Cuda: cudaMemcpy device-to-host failed: %s\n",
                cudaGetErrorString(err));
        goto cleanup;
    }

    printf("\nOriginal:\n");
    for (int i = 0; i < N; ++i) {
        printf("\nRe:%2.5f Im:%2.5f", digits[i].x, digits[i].y);
    }
    printf("\n\n1D-FFT:\n");
    for (int i = 0; i < N; ++i) {
        printf("\nRe:%2.5f Im:%2.5f", h_data[i].x, h_data[i].y);
    }

    status = 0;

cleanup:
    if (planCreated) {
        cufftDestroy(plan);
    }
    cudaFree(data);   /* cudaFree(NULL) is a no-op */
    free(h_data);     /* free(NULL) is a no-op */
    free(digits);
    return status;
}
Note that you should use plain malloc
or the C++ new
operator to allocate host side memory rather than cudaMallocHost
, unless you understand very well what the latter API does and why you are using it.
Upvotes: 1
Reputation: 15734
If you are on Linux, try running your app under the CUDA debugger, with memory checking turned on:
nvcc -g -G myapp.cu -o myapp
cuda-gdb myapp
(cuda-gdb) set cuda memcheck on
(cuda-gdb) run
If you are on Windows, select Nsight | Enable CUDA Memory Checker
and run your code from the built-in CUDA debugger. Nsight | Start CUDA Debugging
.
Also, add code to your app to check the return values from each of the CUDA calls.
For more information about the CUDA memory checker, see:
http://developer.nvidia.com/nvidia-gpu-computing-documentation#CUDA-MemcheckUserManual
Upvotes: 0