Reputation: 2250
I am trying to come up with an accurate way to measure the latency of two operations: a double-precision FMA and a double-precision load from shared memory.
I am using a K20x and was wondering if this code would give accurate measurements.
#include <cuda.h>
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
using namespace std;
// K20x core clock rate in Hz (732 MHz)
#define MHZ 732e6
// number of streaming multiprocessors
#define SMS 14
// number of double precision units per SM
#define DP_UNITS (16*4)
// number of shared memory banks
#define SHARED_BANKS 32
#define ITER 100000
#define NEARONE 1.0000000000000004
__global__ void fma_latency_kernel(double *in, double *out){
    int tid = blockIdx.x*blockDim.x+threadIdx.x;
    double val = in[tid];
    #pragma unroll 100
    for(int i=0; i<ITER; i++){
        val += val*NEARONE;  // dependent FMA chain: each iteration waits on the previous result
    }
    out[tid] = val;
}
__global__ void shared_latency_kernel(double *in, double *out){
    extern volatile __shared__ double smem[];
    int tid = blockIdx.x*blockDim.x+threadIdx.x;
    smem[threadIdx.x] = in[tid];
    #pragma unroll 32
    for(int i=0; i<ITER; i++){
        smem[threadIdx.x] = smem[(threadIdx.x+i)%32]*NEARONE;  // dependent load/store chain through shared memory
    }
    out[tid] = smem[threadIdx.x];
}
int main(int argc, char **argv){
    float time;
    cudaEvent_t start, stop, start2, stop2;
    double *d_A, *d_B;
    // the buffers hold doubles, so allocate sizeof(double) per element
    cudaMalloc(&d_A, DP_UNITS*SMS*sizeof(double));
    cudaMalloc(&d_B, DP_UNITS*SMS*sizeof(double));
    cudaMemset(d_A, 0, DP_UNITS*SMS*sizeof(double));  // avoid reading uninitialized device memory
    cudaError_t err;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    fma_latency_kernel<<<SMS, DP_UNITS>>>(d_A, d_B);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    time /= 1000;  // cudaEventElapsedTime reports ms; convert to seconds
    err = cudaGetLastError();
    if(err != cudaSuccess)
        printf("Error FMA: %s\n", cudaGetErrorString(err));
    printf("Latency of FMA = %3.1f clock cycles\n", (time/(double)ITER)*(double)MHZ);

    cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
    cudaEventCreate(&start2);
    cudaEventCreate(&stop2);
    cudaEventRecord(start2, 0);
    // one double per thread of dynamic shared memory (not just sizeof(double) total)
    shared_latency_kernel<<<1, SHARED_BANKS, SHARED_BANKS*sizeof(double)>>>(d_A, d_B);
    cudaEventRecord(stop2, 0);
    cudaEventSynchronize(stop2);
    cudaEventElapsedTime(&time, start2, stop2);
    time /= 1000;  // ms -> s
    err = cudaGetLastError();
    if(err != cudaSuccess)
        printf("Error Shared Memory: %s\n", cudaGetErrorString(err));
    printf("Latency of Shared Memory = %3.1f clock cycles\n", time/(double)ITER*(double)MHZ);
    return 0;
}
My results on the K20x are the following:
This seems reasonable to me, but I am not sure how accurate it is.
Upvotes: 3
Views: 394
Reputation: 21
Your latency values look very high to me - nearly double what I'd expect. To measure how many cycles something takes on the GPU, you can insert clock() calls before and after the relevant part of the kernel function. The clock function returns the current cycle count as an int, so subtracting the first value from the second gives the number of cycles that passed between dispatching the first clock instruction and dispatching the second.
Note that the numbers you get from this method will include extra time from the clock instructions themselves; I believe that by default a thread blocks for several cycles immediately before and after every clock instruction, so you may want to experiment to see how many cycles it adds so you can subtract them back out.
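As a rough sketch, here is what that might look like applied to the FMA kernel above (this reuses ITER and NEARONE from your code; the kernel name and the d_cycles output buffer are my own illustrative additions, and I've used clock64() rather than clock() to sidestep 32-bit wrap-around):
__global__ void fma_latency_clocked(double *in, double *out, long long *d_cycles){
    int tid = blockIdx.x*blockDim.x+threadIdx.x;
    double val = in[tid];
    long long c0 = clock64();  // cycle counter before the dependent FMA chain
    #pragma unroll 100
    for(int i=0; i<ITER; i++){
        val += val*NEARONE;
    }
    long long c1 = clock64();  // cycle counter after the chain
    out[tid] = val;            // keep val live so the loop isn't optimized away
    if(tid == 0)
        d_cycles[0] = c1 - c0; // total cycles for the chain; divide by ITER on the host
}
On the host you'd copy d_cycles back with cudaMemcpy and divide by ITER for a per-FMA figure; timing an empty region (two back-to-back clock64() calls) gives an estimate of the overhead from the clock reads themselves, which you can subtract out.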
Upvotes: 2