StealthVice7

Reputation: 101

CUDA C/C++ accessing previous array element

In learning CUDA, I'm working on a mini-project to calculate moving averages. While my simple moving average (SMA) works fine (albeit slow and unoptimized), my exponential moving average (EMA) always results in incorrect numbers.

I've figured out that the problem is that *(ema + i - 1) is always 0. The same array-access pattern works perfectly well in a plain C++ test file, but not in my CUDA application. I assume I'm missing some concept about pointers or CUDA.


#include <iostream>
#include <cstdlib>
#include <ctime>

using namespace std;

// simple_ma not included

void __global__ exponential_ma(int n, int period, float *data, float *ema){
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if(i == 0){
        *ema = *data;
    }else if(i < n){

        float k = 2.0f/(period+1);
        *(ema + i) = *(data + i)*k + *(ema + i - 1) * (1.0f-k);
        // PROBLEM OCCURS ON THE LINE ABOVE; ema[i-1] doesn't work either
    }
}

int main(){
    /**
     * Function that computes a moving average on a vector
     */
    int N = 1<<5; // data size
    cout << "N = " << N << " bytes = " << N*sizeof(float) << endl;
    int period = 10; // moving average period

    // malloc'ed for heap usage instead of the small stack size
    float *data = (float*)malloc(N*sizeof(float));
    float *sma = (float*)malloc(N*sizeof(float));
    float *ema = (float*)malloc(N*sizeof(float));

    float *d_data; // device pointer for data
    float *d_sma; // device pointer for simple moving average
    float *d_ema; // device pointer for exponential moving average

    // CUDA allocate memory for data, SMA, and EMA
    cudaMalloc(&d_data, N*sizeof(float));
    cudaMalloc(&d_sma, N*sizeof(float));
    cudaMalloc(&d_ema, N*sizeof(float));

    // initialize data
    srand(time(0));
    data[0] = rand() % 100 + 50;
    for(int i = 1; i < N; i++){
        data[i] = data[i-1] + rand() % 11 - 5;
    }

    // copy data from host to device
    cudaMemcpy(d_data, data, N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_sma, sma, N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_ema, ema, N*sizeof(float), cudaMemcpyHostToDevice);

    // call device function
    simple_ma<<<(N+255)/256, 256>>>(N, period, d_data, d_sma);
    exponential_ma<<<(N+255)/256, 256>>>(N, period, d_data, d_ema);

    cudaMemcpy(sma, d_sma, N*sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(ema, d_ema, N*sizeof(float), cudaMemcpyDeviceToHost);

    for(int i = 0; i < N; i += 1){
        cout << "i = " << i << " data = "<< data[i] << " ---sma---> " << sma[i] << " ---ema---> " << ema[i] << endl;
    }

    cudaFree(d_data);
    cudaFree(d_sma);
    cudaFree(d_ema);

    return 0;
}

Upvotes: 0

Views: 519

Answers (1)

Robert Crovella

Reputation: 151799

Threads in CUDA can execute in any order. The calculation of ema[i-1] may not even have started by the time another thread attempts to calculate ema[i] (which depends on the calculation of ema[i-1] being complete). The method you use for a simple serial implementation of this algorithm will not work in a thread-parallel fashion.

With that in mind, here is one possible approach.

First, recast your recursive ema calculation:

     ema[0] = data[0]
i>0: ema[i] = k*data[i]+(1-k)*ema[i-1]

in non-recursive form:

     ema[0] = data[0]
i>0: ema[i] = ((1-k)^i)*data[0] + ∑(((1-k)^x)*k*data[i-x])  
                                  x=0..i-1

This will inform us how to write our CUDA kernel code. If this transformation seems obscure to you, you may wish to create a table of the first few entries of the sequence, similar to the methodology described in this answer.

A kernel written directly from this form works, but each thread iterates over the entire input array up to its own index. One threadblock (the one covering the highest array indices) will take longer than all the others, and the worst-case thread does approximately the same work as the serial version, so this is not a very interesting parallel implementation.

In order to address this, we can make an observation about the non-recursive form equation. According to your code, the term (1.0 - k) is always less than 1, since k is 2 divided by some positive integer greater than 2 (i.e. we will assume period is 2 or larger). Therefore the (1.0 - k)^x term eventually becomes vanishingly small as the summation proceeds. We will also assume that your data is bounded in range, roughly as you have shown. In that case, as the summation proceeds, eventually the terms that are summed have no appreciable effect on the float sum quantity. With these assumptions, then, we will curtail the loop processing when our (1.0 - k)^x term becomes small enough to not have a significant effect on the result.

With those assumptions and modifications, we can create a CUDA code that runs faster than the naive serial CPU version, while maintaining a small error margin.

$ cat t1444.cu
#include <iostream>
#include <cstdio>
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

unsigned long long dtime_usec(unsigned long long start){

  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}


__global__ void gpu_ema(const int n, const float k, const float * __restrict__ data, float * __restrict__ ema, const float tol){
  int i = blockIdx.x*blockDim.x+threadIdx.x;
  if (i == 0) ema[0] = data[0];
  else if (i < n){
    float sum = 0;
    float fac = 1.0f - k;
    float m = 1.0f;
    int j;
    for (j = 0; j < i; j++){
      sum += m*k*data[i-j];
      m *= fac;
      if (m < tol) break;  // early exit validity depends on a variety of assumptions
      }
    if (j == i) sum += m*data[0];
    ema[i] = sum;
    }
}

void cpu_ema(int n, int period, float *data, float *ema){
  ema[0] = data[0];
  float k = 2.0f/(period+1);
  for (int i = 1; i < n; i++)
    ema[i] = data[i]*k + ema[i-1]*(1.0f-k);
}
int main(){
    /**
     * Function that computes a moving average on a vector
     */
    int N = 1<<20; // data size
    std::cout << "N = " << N << " bytes = " << N*sizeof(float) << std::endl;
    int period = 10; // moving average period

    // malloc'ed for heap usage instead of the small stack size
    float *data = (float*)malloc(N*sizeof(float));
    float *ema = (float*)malloc(N*sizeof(float));
    float *gema = (float*)malloc(N*sizeof(float));

    float *d_data; // device pointer for data
    float *d_ema; // device pointer for exponential moving average

    // CUDA allocate memory for data and EMA
    cudaMalloc(&d_data, N*sizeof(float));
    cudaMalloc(&d_ema, N*sizeof(float));

    // initialize data
    srand(time(0));
    data[0] = rand() % 100 + 50;
    for(int i = 1; i < N; i++){
        data[i] = data[i-1] + rand() % 11 - 5;
    }

    // copy data from host to device
    cudaMemcpy(d_data, data, N*sizeof(float), cudaMemcpyHostToDevice);

    // call device function
    long long gpu_t = dtime_usec(0);
    gpu_ema<<<(N+255)/256, 256>>>(N, 2.0f/(period+1), d_data, d_ema, 1e-7);
    cudaDeviceSynchronize();
    gpu_t = dtime_usec(gpu_t);
    long long cpu_t = dtime_usec(0);
    cpu_ema(N, period, data, ema);
    cpu_t = dtime_usec(cpu_t);
    if (N < 33)
      for (int i = 0; i < N; i++)
        std::cout << ema[i] << ",";
    std::cout << std::endl;

    cudaMemcpy(gema, d_ema, N*sizeof(float), cudaMemcpyDeviceToHost);
    cudaCheckErrors("some CUDA error");
    if (N < 33)
      for(int i = 0; i < N; i += 1)
          std::cout << gema[i] << ",";
    std::cout << std::endl;
    float max_err = fabs(gema[0] - ema[0])/ema[0];
    for (int i = 1; i < N; i++)
      max_err = max(max_err, fabs(gema[i] - ema[i])/ema[0]);
    std::cout << "max err: " << max_err*100.0 << "% final gpu: " << gema[N-1] << " final cpu: " << ema[N-1] << std::endl;
    std::cout << "cpu time: " << cpu_t/(float)USECPSEC << "s gpu time: " << gpu_t/(float)USECPSEC  << "s" << std::endl;
    cudaFree(d_data);
    cudaFree(d_ema);

    return 0;
}
$ nvcc -o t1444 t1444.cu
$ ./t1444
N = 1048576 bytes = 4194304


max err: 0.00218633% final gpu: 1311.38 final cpu: 1311.38
cpu time: 0.006346s gpu time: 0.000214s
$

Tesla V100, CUDA 10.1

To repeat, the validity of the above code with the performance-enhancing early-exit is dependent on range-limited input data. I'm not going to try to carefully cover the statistics, but if you have no idea of the statistics of your input data, then the above method may not be valid.

Upvotes: 3
