Reputation: 1
I'm trying to time my code. I have been told that cudaEvent****() can be used for this; meanwhile, my original code uses clock_gettime() for timing. I print the results measured by both cudaEvent****() and clock_gettime() below, and they are what I'm really confused about.
[timing results screenshot: measured by cudaEvent****()]
[timing results screenshot: measured by clock_gettime()]
Note:
Q1: The time spent on "establish context" measured by cudaEvent****() (0.0072ms) is quite different from that measured by clock_gettime() (~20.5s). Actually, this part has only one line, which establishes a context: cudaFree(0). How does this vast difference happen?
Q2: The time spent on "time stepping" measured by cudaEvent****() (~17.221s) is about twice that measured by clock_gettime() (~8.43s). Someone told me that asynchronization could be a possible reason, but I don't really get it. Can anyone help me get through it? (See the small test after Q3.)
Q3: The wall-clock time spent is really close to the time measured by clock_gettime(). However, I'm told that cudaEvent****() is preferable for timing CUDA code. I don't know which one I should choose.
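To explore the asynchronization hint from Q2, here is a minimal standalone test I could run (it is not my real code; dummy_kernel, the sizes, and the loop counts are made up just to keep the GPU busy). If the explanation is right, kernel launches return to the host immediately, so reading the host clock right after the launch loop only measures launch overhead; only after cudaDeviceSynchronize() does it cover the real GPU work:

#include <stdio.h>
#include <time.h>
#include <cuda_runtime.h>

/* placeholder kernel: just burns some GPU time */
__global__ void dummy_kernel(float *a, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        for (int k = 0; k < 1000; k++)
            a[i] = a[i] * 0.5f + 1.0f;
}

static double get_time(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    return (double)ts.tv_sec + (double)ts.tv_nsec * 1.0e-9;
}

int main(void)
{
    int n = 1 << 20;
    float *d_a;
    cudaMalloc(&d_a, n * sizeof(float));

    double t0 = get_time();
    for (int step = 0; step < 100; step++)
        dummy_kernel<<<(n + 127) / 128, 128>>>(d_a, n);
    double t1 = get_time();       /* launches are only queued at this point */
    cudaDeviceSynchronize();      /* wait until the GPU has actually finished */
    double t2 = get_time();

    printf("host timer without sync: %fs\n", t1 - t0);
    printf("host timer with sync:    %fs\n", t2 - t0);

    cudaFree(d_a);
    return 0;
}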
=============================== update ===============================
The following is the part of my code in which the timing functions and macros are defined.
#include <stdio.h>
#include <time.h>
#include <cuda_runtime.h>

#define TIMING 1
#if TIMING
/* wall-clock time in seconds */
double get_time()
{
    struct timespec time;
    clock_gettime(CLOCK_REALTIME, &time);
    return (double)time.tv_sec + (double)time.tv_nsec * 1.0e-9;
}
#endif

#define CUDATIMING 0
#if CUDATIMING
#define cuda_timing_init \
    cudaEvent_t startEvent, stopEvent;\
    float timeEvent;\
    cudaEventCreate(&startEvent);\
    cudaEventCreate(&stopEvent);
#define cuda_timing_begin \
    cudaEventRecord(startEvent, 0);
#define cuda_timing_stop(str) \
    cudaEventRecord(stopEvent, 0);\
    cudaEventSynchronize(stopEvent);\
    cudaEventElapsedTime(&timeEvent, startEvent, stopEvent);\
    printf("time spent of %s: %fms\n", str, timeEvent);
#define cuda_timing_destroy \
    cudaEventDestroy(startEvent);\
    cudaEventDestroy(stopEvent);
#endif
I use these functions and macros for timing.
===========================update 20150823===============================
Here is the basic structure of my code, including the timing. I'm not sure whether it helps to diagnose my timing issue.
/* if_error(): error-checking wrapper for CUDA calls (definition omitted) */

void
copy_float_from_host_to_dev(float *h_p, float **d_pp, int size)
{
    if_error(cudaMalloc(d_pp, size));
    if_error(cudaMemcpy(*d_pp, h_p, size, cudaMemcpyHostToDevice));
}

void
copy_int_from_host_to_dev(int *h_p, int **d_pp, int size)
{
    if_error(cudaMalloc(d_pp, size));
    if_error(cudaMemcpy(*d_pp, h_p, size, cudaMemcpyHostToDevice));
}

int
main(int argc, char **argv)
{
    // init
    // totally CPU code
    // ......

#if TIMING
    double t1, t2, t3, t4, t5, t6;
    t1 = get_time();
#endif
#if CUDATIMING
    cuda_timing_init;
    cuda_timing_begin;
#endif

    // init data structure
    // totally CPU code
    // ......

#if TIMING
    t2 = get_time();
#endif
#if CUDATIMING
    cuda_timing_stop("init data structure");
    cuda_timing_begin;
#endif

    // establish context
    cudaFree((void*)0);

#if TIMING
    t3 = get_time();
#endif
#if CUDATIMING
    cuda_timing_stop("establish context");
    cuda_timing_begin;
#endif

    // rearrange data
    // totally CPU code
    // data on the CPU side has a different layout
    // than data on the GPU side, so it has to be
    // rearranged.
    // ......

#if TIMING
    t4 = get_time();
#endif
#if CUDATIMING
    cuda_timing_stop("rearrange data");
    cuda_timing_begin;
#endif

    // copy data from host to device
    // about 10 copies; the following are 2 of them,
    // all using copy_float/int_from_host_to_dev
    // h_lap --> d_lap
    copy_float_from_host_to_dev(h_lap, &d_lap, lapsize);
    // h_etol --> d_etol
    copy_int_from_host_to_dev(h_etol, &d_etol, etolsize);
    // ......

#if TIMING
    t5 = get_time();
#endif
#if CUDATIMING
    cuda_timing_stop("copy data");
    cuda_timing_begin;
#endif

    // time stepping
    for(step = 1; step < para->nstep; step++)
    {
        /* kernel_1: matrix-vector multiplication.
         * The matrix is special, so the multiplication
         * can be very fast.
         * atomic operations are involved;
         * no data transfers between host and device */
        kernel_1<<<32768, 128>>>(......);
        /* kernel_2: vector operations.
         * Assuming that a,b,c,d are vectors,
         * kernel_2 computes: a = 2*a - b + c*d
         * no data transfers between host and device */
        kernel_2<<<16384, 128>>>(......);
    }

#if TIMING
    t6 = get_time();
    printf("total time: %fs\n", t6-t1);
    printf("  init data structure: %fs\n", t2-t1);
    printf("  establish context: %fs\n", t3-t2);
    printf("  rearrange data: %fs\n", t4-t3);
    printf("  copy data: %fs\n", t5-t4);
    printf("  time stepping: %fs\n", t6-t5);
#endif
#if CUDATIMING
    cuda_timing_stop("time stepping");
    cuda_timing_destroy;
#endif

    // destroy data structure
    // totally CPU code
    // ......

    return 0;
}
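Regarding Q3, here is a self-contained sketch of how I could combine the two timers on the same region (again a made-up kernel, busy, and made-up sizes; not my real code). If the host clock is read only after cudaEventSynchronize() has returned, I would expect the two numbers to roughly agree:

#include <stdio.h>
#include <time.h>
#include <cuda_runtime.h>

__global__ void busy(float *a, int n)    /* made-up kernel to occupy the GPU */
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        for (int k = 0; k < 1000; k++)
            a[i] = a[i] * 1.0001f + 0.5f;
}

static double get_time(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    return (double)ts.tv_sec + (double)ts.tv_nsec * 1.0e-9;
}

int main(void)
{
    int n = 1 << 20;
    float *d_a;
    cudaMalloc(&d_a, n * sizeof(float));

    cudaEvent_t start, stop;
    float ms;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    double t0 = get_time();
    cudaEventRecord(start, 0);
    for (int step = 0; step < 100; step++)
        busy<<<(n + 127) / 128, 128>>>(d_a, n);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);      /* host blocks until the GPU is done */
    cudaEventElapsedTime(&ms, start, stop);
    double t1 = get_time();          /* read only after the sync above */

    printf("cudaEvent:     %fs\n", ms / 1000.0);
    printf("clock_gettime: %fs\n", t1 - t0);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_a);
    return 0;
}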
Upvotes: 0
Views: 320
Reputation: 72353
You have only provided a single code example, so I can only provide a single answer:
The time spent on "establish context" measured by cudaEvent****() (0.0072ms) is quite different from that measured by clock_gettime() (~20.5s). Actually, this part has only one line, which establishes a context: cudaFree(0). How does this vast difference happen?
Your assumption that the cudaFree call establishes the CUDA context is incorrect. Lazy context establishment happens with the first call that needs to interact directly with the context. In this case it is your event timing code that is establishing the context, so the cudaFree call is basically free. That is why there is such a large wall-clock time difference between the two timing methods.
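You can see this for yourself with a sketch along these lines (my construction, not your code). Built with nvcc -DWARMUP, an earlier cudaEventCreate call establishes the context, and the timed cudaFree(0) is practically instantaneous; built without -DWARMUP, cudaFree(0) is the first runtime call and absorbs the whole context establishment cost:

#include <stdio.h>
#include <time.h>
#include <cuda_runtime.h>

static double get_time(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    return (double)ts.tv_sec + (double)ts.tv_nsec * 1.0e-9;
}

int main(void)
{
#ifdef WARMUP
    /* Any call that needs the context establishes it lazily.
     * This mimics what cuda_timing_init does in your code. */
    cudaEvent_t e;
    cudaEventCreate(&e);
    cudaEventDestroy(e);
#endif
    double t0 = get_time();
    cudaFree(0);                 /* the supposed "context establishment" call */
    double t1 = get_time();
    printf("cudaFree(0) took %fs\n", t1 - t0);
    return 0;
}

Comparing the two builds should reproduce the discrepancy you observed between the two timing methods.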
Upvotes: 2