alberto
alberto

Reputation: 2645

Fast computing of co-occurrence matrix from N vectors with labels

I have a matrix that contains, in each of its N rows (iterations of a clustering algorithm), the cluster to which each of its M points (columns) belongs:

For instance:

data <- t(rmultinom(50, size = 7, prob = rep(0.1,10)))

     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
 [1,]    0    0    0    2    1    1    0    2    1     0
 [2,]    3    1    2    0    0    0    0    1    0     0
 [3,]    0    1    2    1    0    0    0    0    2     1
 [4,]    0    1    1    0    2    0    0    2    0     1
 [5,]    3    0    0    0    2    1    0    0    0     1
 [6,]    0    1    2    0    0    1    1    2    0     0
 [7,]    0    1    0    1    0    1    1    2    1     0
 [8,]    3    0    0    2    0    0    0    1    0     1
 ...

I want to build a co-occurrence matrix in which entry (i,j) is the number of times that points i and j have been seen in the same cluster across the different rows.

A naive approach would be:

  # Build the co-occurrence matrix: entry (i, j) counts how many rows of
  # `data` assign points i and j to the same cluster label.
  # Dimensions and loop bounds are taken from `data` itself instead of
  # hard-coding 50 and 10, and seq_len() is safe if a dimension is zero.
  coincidences <- matrix(0, nrow = ncol(data), ncol = ncol(data))
  for (n in seq_len(nrow(data))) {
    for (m in seq_len(ncol(data))) {
      # Add one whole row at a time: matches of point m against every
      # point within iteration (row) n.
      coincidences[m, ] <- coincidences[m, ] + as.numeric(data[n, m] == data[n, ])
    }
  }

      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
 [1,]   50   17   21   22   15   14   16   20   18    18
 [2,]   17   50   17   14   17   18   15   14   20    16
 [3,]   21   17   50   20   21   16   16   13   16    20
 [4,]   22   14   20   50   16   18   16   21   18    14
 [5,]   15   17   21   16   50   18   16   17   11    17
 [6,]   14   18   16   18   18   50   18   22   25    13
 [7,]   16   15   16   16   16   18   50   14   20    22
 [8,]   20   14   13   21   17   22   14   50   11    15
 [9,]   18   20   16   18   11   25   20   11   50    18
[10,]   18   16   20   14   17   13   22   15   18    50

How may I make it faster?

Extra: how can I plot it using ggplot2? (I have seen heatmap.2 in the gplots package, but I don't know whether that is overkill.)

Upvotes: 2

Views: 304

Answers (2)

aeongrail
aeongrail

Reputation: 1384

Rcpp

Using a C++ implementation in R via the Rcpp package can get the job done about as fast as it will get:

library(Rcpp)

data <- t(rmultinom(50, size = 7, prob = rep(0.1,10)))
    coincidences <- matrix(0, nrow=10, ncol=10)

#R implementation
# Co-occurrence counts, pure-R reference implementation.
#
# For every pair of points (columns of `data`), counts across the rows
# how often the two points were assigned the same cluster label.
#
# @param data N x M matrix: row n holds the cluster label of each of the
#   M points at iteration n.
# @param coincidences M x M numeric matrix of starting counts (usually
#   all zeros); the new counts are added onto it.
# @return M x M matrix where entry (i, j) is coincidences[i, j] plus the
#   number of rows in which points i and j share a label.  The diagonal
#   gains nrow(data), since a point always matches itself.
fR <- function(data, coincidences) {
  # seq_len() on the actual dimensions instead of the hard-coded
  # 1:50 / 1:10, so the function works for any matrix size and is safe
  # for zero-row or zero-column input.
  for (n in seq_len(nrow(data))) {
    for (m in seq_len(ncol(data))) {
      # One whole row at a time: matches of point m vs. every point in
      # iteration n.
      coincidences[m, ] <- coincidences[m, ] + as.numeric(data[n, m] == data[n, ])
    }
  }
  coincidences
}


#C++ Implementation
# C++ implementation via Rcpp.
#
# BUG FIX: the original version ran the innermost index p up to
# data.nrow() (the number of iterations, 50) instead of the number of
# points (the 10 columns), so data(n, p) and coincidences(m, p) were
# indexed out of bounds -- Rcpp's operator() does not bounds-check, so
# this was silent out-of-range memory access.  The loop below bounds p
# by the number of points.  Also removed an unused `out` matrix and the
# `addon` temporary.  Counts accumulate in place into `coincidences`,
# which is also returned.
cppFunction('NumericMatrix fC(NumericMatrix data, NumericMatrix coincidences) {

    int n_iter   = data.nrow();   // clustering iterations (rows of data)
    int n_points = data.ncol();   // points (columns of data, dim of coincidences)

    for (int n = 0; n < n_iter; n++) {
        for (int m = 0; m < n_points; m++) {
            for (int p = 0; p < n_points; p++) {
                // One co-occurrence when points m and p share a label
                // in iteration n.
                if (data(n, m) == data(n, p)) {
                    coincidences(m, p) += 1;
                }
            }
        }
    }
    return coincidences;
}')

 # Sanity check: run both implementations on fresh zero matrices and
 # confirm they produce identical co-occurrence counts.
 #Call functions
 coincidences <- matrix(0, nrow=10, ncol=10)
 c1<-fC(data,coincidences)
 # Fresh zero matrix: NOTE(review) fC appears to accumulate into the
 # matrix it is given (Rcpp NumericMatrix is a proxy to the R object),
 # so the first matrix must not be reused -- verify.
 coincidences <- matrix(0, nrow=10, ncol=10)
 c2<-fR(data,coincidences)
 all.equal(c1,c2)
 > TRUE


 library(microbenchmark)
 # Time both implementations (100 runs each by default).
 # NOTE(review): fC seems to modify `coincidences` in place, so counts
 # accumulate across benchmark iterations -- confirm whether that
 # matters for the timing comparison.
 microbenchmark(fC(data,coincidences),fR(data,coincidences))

>    Unit: microseconds
                       expr     min      lq      mean  median      uq     max neval
     fC(data, coincidences)   6.415   6.736   8.88454   7.698   8.660  74.727   100
     fR(data, coincidences) 283.514 290.089 301.84637 293.456 309.973 388.388   100

Edit

To plot:

# Melt the matrix to long format (Var1, Var2, value), then draw it as a
# heatmap with one filled cell per (i, j) pair.
library(reshape2)
library(ggplot2)  # was missing: ggplot()/aes()/geom_raster() live here
C <- fC(data, coincidences)
ggplot(melt(C), aes(Var1, Var2, fill = value)) + geom_raster()

Upvotes: 4

fishtank
fishtank

Reputation: 3728

Faster way using vectorization and colSums:

> set.seed(1)
> data <- t(rmultinom(10000, size = 7, prob = rep(0.1,100)))
>
> system.time({
+  coincidences <- matrix(0, nrow=100, ncol=100)
+  for (n in 1:10000){
+    for (m in 1:100){
+      coincidences[m,] <- coincidences[m,] + as.numeric(data[n,m] == data[n,])
+      }
+    }}
+ )
   user  system elapsed
  9.692   0.000   9.708
>
> system.time(coincidences2<-sapply(1:ncol(data), function(i){ colSums(data[,i]==data) }))
   user  system elapsed
  0.676   0.096   0.774
>
> all.equal(coincidences2,coincidences)
[1] TRUE

Upvotes: 1

Related Questions