user3699775
user3699775

Reputation:

Improve performance of a construction of p-values matrix for a permutation test

I used an R code which implements a permutation test for the distributional comparison between two populations of functions. We have p univariate p-values.

The bottleneck is the construction of a matrix which contains all the possible CONTIGUOS p-values.
The last row of the matrix of p-values contain all the univariate p-values.
The penultimate row contains all the bivariate p-values in this order:
p_val_c(1,2), p_val_c(2,3), ..., p_val_c(p, 1) ...
The elements of the first row are coincident and the value associated is the p-value of the global test p_val_c(1,...,p)=p_val_c(2,...,p,1)=...=pval(p,1,...,p-1).

For computational reasons, I have decided to implement this component in c and use it in R with .C.

Here the code. The unique important part is the definition of the function Build_pval_asymm_matrix.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>

void Build_pval_asymm_matrix(int * p, int  * B, double  * pval, 
                             double * L, 
                             double * pval_asymm_matrix);
// Function used for the sorting of vector T_temp with qsort
int cmp(const void *x, const void *y);

int main() {
    int B = 1000; // number Conditional Monte Carlo (CMC) runs
    int p = 100; // number univariate tests

// Generate fictitiously data univariate p-values pval and matrix L. 
// The j-th column of L is the empirical survival 
// function of the statistics test associated to the j-th coefficient 
// of the basis expansion. The dimension of L is B * p.

// Generate pval
double pval[p];
memset(pval, 0, sizeof(pval)); // initialize all elements to 0
for (int i = 0; i < p; i++) {
    pval[i] = (double)rand() / (double)RAND_MAX;
}

// Construct L
double L[B * p];
// Inizialize to 0 the elements of L
memset(L, 0, sizeof(L)); 
// Array used to construct the columns of L
double temp_array[B];
memset(temp_array, 0, sizeof(temp_array)); 
for(int i = 0; i < B; i++) {
    temp_array[i] = (double) (i + 1) / (double) B;
}
for (int iter_coeff=0; iter_coeff < p; iter_coeff++) { 
// Shuffle temp_array
    if (B > 1) {
       for (int k = 0; k < B - 1; k++) 
       {
         int j = rand() % B;
         double t = temp_array[j];
         temp_array[j] = temp_array[k];
         temp_array[k] = t;
       }
   }
    for (int i=0; i<B; i++) {
        L[iter_coeff + p * i] = temp_array[i];  
    }   
}

double pval_asymm_matrix[p * p]; 
memset(pval_asymm_matrix, 0, sizeof(pval_asymm_matrix));
// Construct the asymmetric matrix of p-values
clock_t start, end;
double cpu_time_used;

start = clock();
Build_pval_asymm_matrix(&p, &B, pval, L, pval_asymm_matrix); 
end = clock();

cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
printf("TOTAL CPU time used: %f\n", cpu_time_used);

return 0;
}

void Build_pval_asymm_matrix(int * p, int  * B, double  * pval, 
                             double * L, 
                             double * pval_asymm_matrix) {
    int nbasis = *p, iter_CMC = *B;
    // Scalar output fisher combining function applied on univariate
    // p-values
    double T0_temp = 0;
    // Vector output fisher combining function applied on a set of
    //columns of L
    double T_temp[iter_CMC];
    memset(T_temp, 0, sizeof(T_temp));

    // Counter for elements of T_temp greater than or equal to T0_temp
    int count = 0;
    // Indexes for columns of L
    int inf = 0, sup = 0; 

    // The last row of matrice_pval_asymm contains the univariate p-values
    for(int i = 0; i < nbasis; i++) {
        pval_asymm_matrix[i + nbasis * (nbasis - 1)] = pval[i];
     }

     // Construct the rows from bottom to up
     for (int row = nbasis - 2; row >= 0; row--) {
         for (int col = 0; col <= row; col++) {
             T0_temp = 0;
             memset(T_temp, 0, sizeof(T_temp));
             inf = col;
             sup = (nbasis - row) + col - 1;
             // Combining function Fisher applied on 
             // p-values pval[inf:sup] 
             for (int k = inf; k <= sup; k++) {
                 T0_temp += log(pval[k]);
             }
             T0_temp *= -2;

             // Combining function Fisher applied 
             // on columns inf:sup of matrix L
             for (int k = 0; k < iter_CMC; k++) {
                 for (int l = inf; l <= sup; l++) {
                     T_temp[k] += log(L[l + nbasis * k]);
                 }
                 T_temp[k] *= -2;
             }  

             // Sort the vector T_temp
             qsort(T_temp, iter_CMC, sizeof(double), cmp);

             // Count the number of elements of T_temp less than T0_temp
             int h = 0;
             while (h < iter_CMC && T_temp[h] < T0_temp) {
                 h++;
             }
// Number of elements of T_temp greater than or equal to T0_temp
count = iter_CMC - h;
pval_asymm_matrix[col + nbasis * row] = (double) count / (double)iter_CMC;
}
// auxiliary variable for columns of L inf:nbasis-1 and 1:sup
int aux_first = 0, aux_second = 0;  
int num_col_needed = 0;

for (int col = row + 1; col < nbasis; col++) {
  T0_temp = 0;
  memset(T_temp, 0, sizeof(T_temp));

  inf = col;
  sup = ((nbasis - row) + col) % nbasis - 1;

  // Useful indexes 
  num_col_needed = nbasis - inf + sup + 1;
  int index_needed[num_col_needed];
  memset(index_needed, -1, num_col_needed * sizeof(int));

  aux_first = inf;
  for (int i = 0; i < nbasis - inf; i++) {
      index_needed[i] = aux_first;
      aux_first++;
  }
  aux_second = 0;
  for (int j = 0; j < sup + 1; j++) {
      index_needed[j + nbasis - inf] = aux_second;
      aux_second++;
  }

  // Combining function Fisher applied on p-values 
  // pval[inf:p-1] and pval[0:sup-1]1]
  for (int k = 0; k < num_col_needed; k++) {
        T0_temp += log(pval[index_needed[k]]);
  }
  T0_temp *= -2;

  // Combining function Fisher applied on columns inf:p-1 and 0:sup-1
  // of matrix L
  for (int k = 0; k < iter_CMC; k++) {
    for (int l = 0; l < num_col_needed; l++) {
        T_temp[k] += log(L[index_needed[l] + nbasis * k]);
    }
    T_temp[k] *= -2;
  } 

  // Sort the vector T_temp
  qsort(T_temp, iter_CMC, sizeof(double), cmp);

 // Count the number of elements of T_temp less than T0_temp
  int h = 0;
  while (h < iter_CMC && T_temp[h] < T0_temp) {
      h++;
  }
// Number of elements of T_temp greater than or equal to T0_temp
count = iter_CMC - h;  
pval_asymm_matrix[col + nbasis * row] = (double) count / (double)iter_CMC;
  } // end for over col from row + 1 to nbasis - 1
 } // end for over rows of asymm p-values matrix except the last row
}

int cmp(const void *x, const void *y)
{
  double xx = *(double*)x, yy = *(double*)y;
  if (xx < yy) return -1;
  if (xx > yy) return  1;
  return 0;
}

Here the times of execution in seconds measured in R:

time_original_function
user system elapsed
79.726 1.980 112.817

time_function_double_for
user system elapsed
79.013 1.666 89.411

time_c_function
user system elapsed
47.920 0.024 56.096

The first measure was obtained using an equivalent R function with duplication of the vector pval and matrix L.
What I wanted to ask is some suggestions in order to decrease the execution time with the C function for simulation purposes. The last time I used c was five years ago and consequently there is room for improvement. For instance I sort the vector T_temp with qsort in order to compute in linear time with a while the number of elements of T_temp greater than or equal to T0_temp. Maybe this task could be done in a more efficient way. Thanks in advance!!

Upvotes: 1

Views: 108

Answers (1)

user4842163
user4842163

Reputation:

I reduced the input size to p to 50 to avoid waiting on it (don't have such a fast machine) -- keeping p as is and reducing B to 100 has a similar effect, but profiling it showed that ~7.5 out of the ~8 seconds used to compute this was spent in the log function.

qsort doesn't even show up as a real hotspot. This test seems to headbutt the machine more in terms of micro-efficiency than anything else.

So unless your compiler has a vastly faster implementation of log than I do, my first suggestion is to find a fast log implementation if you can afford some accuracy loss (there are ones out there that can compute log over an order of magnitude faster with precision loss in the range of ~3% or so).

If you cannot have precision loss and accuracy is critical, then I'd suggest trying to memoize the values you use for log if you can and store them into a lookup table.

Update

I tried the latter approach.

// Create a memoized table of log values.
double log_cache[B * p];
for (int j=0, num=B*p; j < num; ++j)
    log_cache[j] = log(L[j]);

Using malloc might be better here, as we're pushing rather large data to the stack and could risk overflows.

Then pass her into Build_pval_asymm_matrix.

Replace these:

T_temp[k] += log(L[l + nbasis * k]);
...
T_temp[k] += log(L[index_needed[l] + nbasis * k]);

With these:

T_temp[k] += log_cache[l + nbasis * k];
...
T_temp[k] += log_cache[index_needed[l] + nbasis * k];

This improved the times for me from ~8 seconds to ~5.3 seconds, but we've exchanged the computational overhead of log for memory overhead which isn't that much better (in fact, it rarely is but calling log for double-precision floats is apparently quite expensive, enough to make this exchange worthwhile). The next iteration, if you want more speed, and it is very possible, involves looking into cache efficiency.

For this kind of huge matrix stuff, focusing on memory layouts and access patterns can work wonders.

Upvotes: 1

Related Questions