wake_wake

Reputation: 1204

R - count similar values when pair-wise comparing groups in a data.frame

I have observational data for basketball players. Each ID represents a player.

df <- data.frame(id = c("A", "B", "c"),
                  V1 = c(1, 3, 2),
                  V2 = c(1, 2, 2),
                  V3 = c(3, 1, NA))
df
  id V1 V2 V3
1  A  1  1  3
2  B  3  2  1
3  c  2  2 NA

I want to compare all players pair-wise and count the number of values they have in common.

It does not matter whether the matching values appear in different columns. Note that some players have NA in some fields.

The desired outcome should look something like this:

desired <- data.frame(id_x = c("A", "A", "B"),
                      id_y = c("B", "C", "C"),
                      similar = c(2, 0, 1))
desired
  id_x id_y similar
1    A    B       2
2    A    C       0
3    B    C       1
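
To be explicit about the rule: for a pair of players, count how many distinct non-NA values of one player also occur among the other player's values. For example, for the pair A/B:

x <- unlist(df[df$id == "A", -1])   # values of player A: 1 1 3
y <- unlist(df[df$id == "B", -1])   # values of player B: 3 2 1
sum(unique(na.omit(x)) %in% y)      # distinct shared values -> 2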

The real data consists of tens of thousands of players, so performance is important too.

Any pointers are much appreciated.

Upvotes: 3

Views: 104

Answers (4)

Alexis

Reputation: 5059

Maybe you can also use the proxy package for this problem:

library(proxy)

df <- data.frame(id = c("A", "B", "c"),
                 V1 = c(1, 3, 2),
                 V2 = c(1, 2, 2),
                 V3 = c(3, 1, NA))

# number of distinct non-NA values of x that also occur in y
myfun <- function(x, y) {
  sum(unique(setdiff(x, NA)) %in% y)
}

# register the custom measure in proxy's database so simil() can use it
pr_DB$set_entry(FUN=myfun, names="myfun", distance=FALSE, loop=TRUE)

similar <- proxy::simil(df[, -1L], method="myfun")

res <- combn(df$id, 2L)
res <- data.frame(id_x=res[2L,], id_y=res[1L,])
res$similar <- as.integer(similar)

print(res)
  id_x id_y similar
1    B    A       2
2    c    A       0
3    c    B       1
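
If you prefer the same orientation as the desired output in the question (id_x = A, A, B), you can take the combn rows the other way around; the similarity values stay the same because the measure is symmetric here:

res <- combn(df$id, 2L)
res <- data.frame(id_x=res[1L,], id_y=res[2L,])
res$similar <- as.integer(similar)

print(res)
  id_x id_y similar
1    A    B       2
2    A    c       0
3    B    c       1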

It does seem somewhat faster on my machine:

df <- data.frame(id = sample(10e2),
                 V1 = sample(1:15, 10e2, replace = TRUE),
                 V2 = sample(2:16, 10e2, replace = TRUE),
                 V3 = sample(3:17, 10e2, replace = TRUE))

system.time({
  similar <- proxy::simil(df[, -1L], method="myfun")

  res <- combn(df$id, 2L)
  res <- data.frame(id_x=res[2L,], id_y=res[1L,])
  res$similar <- as.integer(similar)
})

   user  system elapsed 
   7.84    0.05    7.92
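
As a side note, the custom entry stays registered in proxy's database for the rest of the session; if you want to redefine it under the same name, remove it first:

pr_DB$delete_entry("myfun")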

Upvotes: 1

M--

Reputation: 28825

We can make a list of each pair of rows and use that to find the intersection of the two rows in each pair. See below:

toCheck <- combn(rownames(df), 2, simplify = FALSE)
names(toCheck) <- sapply(toCheck, paste, collapse = "&")

sapply(toCheck, function(x){
  length(base::intersect(as.list(df[x[1],-1]), as.list(df[x[2],-1])))
})

# 1&2 1&3 2&3 
#   2   0   1
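
If you also want the id_x / id_y layout from the question, the result above (assigned to a variable here) can be reshaped; a possible sketch:

similar <- sapply(toCheck, function(x){
  length(base::intersect(as.list(df[x[1],-1]), as.list(df[x[2],-1])))
})
pairs <- do.call(rbind, toCheck)   # row indices of each pair
data.frame(id_x = df$id[as.integer(pairs[, 1])],
           id_y = df$id[as.integer(pairs[, 2])],
           similar = unname(similar))

#   id_x id_y similar
# 1    A    B       2
# 2    A    c       0
# 3    B    c       1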

Testing on your larger dataset:

set.seed(45)
df2 <- data.frame(ID = sample(10e2),
                  V1 = sample(1:15, 10e2, replace = TRUE),
                  V2 = sample(1:16, 10e2, replace = TRUE),
                  V3 = sample(1:17, 10e2, replace = TRUE)) 

M_M_approach <- function(mdf) {
  Check <- combn(rownames(mdf), 2, simplify = FALSE)
  names(Check) <-  sapply(Check, paste, collapse = "&")

  sapply(Check, function(x){
    length(base::intersect(as.list(mdf[x[1],-1]), as.list(mdf[x[2],-1]))) })
}

M_M_approach(df2)
# 1&2 1&3 2&3 
#   1   1   2 

microbenchmark::microbenchmark(M_M_approach = M_M_approach(df2), times = 5)
# Unit: milliseconds
#          expr      min       lq     mean   median       uq      max neval
#  M_M_approach 225.6985 228.1924 248.5623 250.4814 255.1007 283.3385     5

Upvotes: 1

Onyambu

Reputation: 79208

I will give two different approaches:

A <- lapply(apply(df[-1], 1, list), unlist)
combn(A, 2, function(x) sum(unique(na.omit(x[[1]])) %in% unique(na.omit(x[[2]]))))
[1] 2 0 1

or

B <- apply(df[-1], 1, function(x)
  apply(df[-1], 1, function(y) sum(unique(na.omit(x)) %in% unique(na.omit(y)))))
B[lower.tri(B)]
[1] 2 0 1
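
Neither snippet is tied to the ids; if you need the pair labels as well, the combn output can be combined with df$id (reusing A from the first approach), for example:

similar <- combn(A, 2, function(x) sum(unique(na.omit(x[[1]])) %in% unique(na.omit(x[[2]]))))
ids <- combn(as.character(df$id), 2)
data.frame(id_x = ids[1, ], id_y = ids[2, ], similar = similar)
#   id_x id_y similar
# 1    A    B       2
# 2    A    c       0
# 3    B    c       1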

Upvotes: 1

wake_wake

Reputation: 1204

I'm benchmarking the two approaches from Onyambu's great answer.

Make a larger test sample:

df2 <- data.frame(ID = sample(10e2),
                  V1 = sample(1:15, 10e2, replace = TRUE),
                  V2 = sample(2:16, 10e2, replace = TRUE),
                  V3 = sample(3:17, 10e2, replace = TRUE))

Run benchmark:

library(microbenchmark)
bench <- microbenchmark(

# option A
A = lapply(apply(df2[-1], 1, list), unlist),
A1 = combn(A, 2, function(x) sum(unique(x[[1]]) %in% unique(x[[2]]))),

# option B
B = apply(df2[-1], 1, function(x) apply(df2[-1], 1, function(y) sum(unique(x) %in% unique(y)))),
B2 = B[lower.tri(B)],

# repeat 5 times
times = 5)

Produces:

bench

Unit: milliseconds
 expr      min          lq        mean      median          uq         max neval cld
A     10.44847    10.83849    11.79438    11.33756    11.34568    15.00171     5 a  
A1 25420.53573 25735.88333 26721.22973 25802.89428 26658.98114 29987.85417     5  b 
B  52173.85540 52519.34839 53327.35931 52661.64372 54508.70321 54773.24582     5   c
B2    33.43663    34.16278    34.91674    35.19001    35.81182    35.98246     5 a  
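
Note that this times the two steps of each approach separately (and A1 / B2 reuse objects that already exist in the workspace). To compare the approaches end-to-end, one option is to wrap both steps into a single expression, e.g.:

microbenchmark::microbenchmark(
  option_A = {
    A <- lapply(apply(df2[-1], 1, list), unlist)
    combn(A, 2, function(x) sum(unique(x[[1]]) %in% unique(x[[2]])))
  },
  option_B = {
    B <- apply(df2[-1], 1, function(x) apply(df2[-1], 1, function(y) sum(unique(x) %in% unique(y))))
    B[lower.tri(B)]
  },
  times = 5)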

The real data is considerably larger.

Are there any options with better performance?

Upvotes: 0
