Luisa Anaya
Luisa Anaya

Reputation: 65

repeating apply with different combination of cols

I can´t nest an apply funtion to repeat it many times with other combination of columns

I need to get a percentage of sp1==1 & s1==1and sp2==1 & s1==1 regarding s1, and in the same way regarding s2, s3... s1000. Here a short example:

x <- data.frame("sp1"=rep(0:1, times=5),
                "sp2"=rep(0:1, each=5),
                "s1" = rep(0:1, times=10),
                "s2" = rep(0:1, each=2),
                "s3" = rep(1:0, each=2))
> x
   sp1 sp2 s1 s2 s3
1    0   0  0  0  1
2    1   0  1  0  1
3    0   0  0  1  0
4    1   0  1  1  0
5    0   0  0  0  1
6    1   1  1  0  1
7    0   1  0  1  0
8    1   1  1  1  0
9    0   1  0  0  1
10   1   1  1  0  1
11   0   0  0  1  0
12   1   0  1  1  0
13   0   0  0  0  1
14   1   0  1  0  1
15   0   0  0  1  0
16   1   1  1  1  0
17   0   1  0  0  1
18   1   1  1  0  1
19   0   1  0  1  0
20   1   1  1  1  0

Now I typed a function to calculate percentage regarding s1:

r <- as.data.frame(sapply(x[,1:2],
                          function(i) sum(i ==1 & x$s1 == 1)/sum(i ==1)))
> r
    sapply(x[, 1:2], function(i) sum(i == 1 & x$s1 == 1)/sum(i == 1))
sp1                                                               1.0
sp2                                                               0.6

I want to built a df with all percentages of sp1, sp2, sp3, ...sp200 regarding s1, s2, s3, ...s1000...

> r
      s1   s2   s3 ... s1000
sp1   1.0  0.5  0.5
sp2   0.6  0.5  0.5
...
sp200

I've tried to do a function with both groups-variables, one for sp's and another for s's:

intento <- as.data.frame(sapply(i=x[,1:2], 
                                j=x[,3:5], 
                                function(i,j)sum(i ==1 & j == 1)/sum(i ==1)))

But logically that´s not the way:

Error in match.fun(FUN) : argument "FUN" is missing, with no default

Upvotes: 3

Views: 85

Answers (4)

Cole
Cole

Reputation: 11255

A similar approach is to use lapply(x, function(x) which(x == 1) and then use that down the road. The thought process being that we might as well store the information instead of repeatedly checking it.

#as suggested by @Ronak
sp_cols <- grep("^sp", names(x))
s_cols <- grep("^s\\d+", names(x))

x_l_zero <- lapply(x, function(x) which(x == 1))
sapply(x_l_zero[s_cols]
       , function(x)  sapply(x_l_zero[sp_cols]
                             , function(y) length(intersect(x,y))/length(y)))

     s1  s2  s3
sp1 1.0 0.5 0.5
sp2 0.6 0.5 0.5

@Ronak has the fastest solution and is more-or-less the OP's code that's been addressed.

Unit: microseconds
             expr    min      lq     mean  median      uq     max neval
 jay.sf_outer_FUN 1190.8 1240.85 1360.103 1284.50 1337.30  2627.0   100
 cole_which_apply  268.4  289.00  454.609  306.05  322.00  7610.7   100
 ronak_1_unsimple  181.3  193.95  321.863  209.95  233.40  6227.4   100
   ronak_2_simple  228.5  241.25  342.354  250.65  276.05  7478.4   100
      akrun_dplyr 5218.7 5506.05 6108.997 5721.80 6081.65 25147.3   100

Code for performance:

library(microbenchmark)
library(tidyverse)
##data set
x <- data.frame("sp1"=rep(0:1, times=5),
                "sp2"=rep(0:1, each=5),
                "s1" = rep(0:1, times=10),
                "s2" = rep(0:1, each=2),
                "s3" = rep(1:0, each=2))

#for jay.sf
FUN <- Vectorize(function(i,j) sum(x[i] == 1 & x[j] == 1)/sum(x[i] == 1))

#names of columns
sp_cols <- grep("^sp", names(x))
s_cols <- grep("^s\\d+", names(x))

sp_cols_nam <- grep("^sp", names(x), value = T)
s_cols_nam <- grep("^s\\d+", names(x), value = T)

#benchmark
microbenchmark(
  outer_FUN = {
    outer(sp_cols, s_cols, FUN)
  }
  , apply_heaven = {
    x_l_zero <- lapply(x, function(x) which(x == 1))
    sapply(x_l_zero[s_cols], function(x)  sapply(x_l_zero[sp_cols] , function(y) length(intersect(x,y))/length(y)))
  }
  , ronak_1_unsimple = {
    sapply(x[sp_cols], function(i) sapply(x[s_cols], 
                                            function(j) sum(i == 1 & j == 1)/sum(i == 1)))
  }
  , ronak_2_simple = {
    sapply(x[s_cols], function(i) sapply(x[sp_cols], function(j) sum(i & j)/sum(j)))
  }
  , akrun_dplyr = {
    crossing(nm1 = sp_cols_nam,  
             nm2 = s_cols_nam) %>%
      mutate(val = pmap_dbl(., ~ sum(x[..1] ==1 & x[..2] == 1)/sum(x[..1]))) %>%
      spread(nm2, val)
  }
)

Upvotes: 2

jay.sf
jay.sf

Reputation: 73572

You're looking for outer. Your function just needs to be Vectorized.

FUN <- Vectorize(function(i,j) sum(x[i] == 1 & x[j] == 1)/sum(x[i] == 1))

outer(1:2, 3:5, FUN)
#      [,1] [,2] [,3]
# [1,]  1.0  0.5  0.5
# [2,]  0.6  0.5  0.5

You could refine this using grep to find the columns automatically

outer(grep("sp", names(x)), grep("s\\d+", names(x)), FUN)

Upvotes: 2

akrun
akrun

Reputation: 887721

Here is an option with tidyverse

library(tidyverse)
crossing(nm1 = names(x)[startsWith(names(x), "sp")],  
        nm2 = grep("^s\\d+", names(x), value = TRUE)) %>%
    mutate(val = pmap_dbl(., ~ sum(x[..1] ==1 & x[..2] == 1)/sum(x[..1]))) %>%
    spread(nm2, val)
# A tibble: 2 x 4
#  nm1      s1    s2    s3
#  <chr> <dbl> <dbl> <dbl>
#1 sp1     1     0.5   0.5
#2 sp2     0.6   0.5   0.5

Upvotes: 0

Ronak Shah
Ronak Shah

Reputation: 389215

We can seperate the columns based on their names and use sapply on them

sp_cols <- grep("^sp", names(x))
s_cols <- grep("^s\\d+", names(x))

sapply(x[sp_cols], function(i) sapply(x[s_cols], 
                        function(j) sum(i == 1 & j == 1)/sum(i == 1)))

If you have only 1 and 0's as values in the columns this can be reduced to

sapply(x[s_cols], function(i) sapply(x[sp_cols], function(j) sum(i & j)/sum(j)))

#     s1  s2  s3
#sp1 1.0 0.5 0.5
#sp2 0.6 0.5 0.5

Upvotes: 2

Related Questions