rm1512
rm1512

Reputation: 11

count unique row combinations in R dataframe

I have an original dataframe that contains a unique value in the first column, with X (no minimum or maximum) rows following that contains values pertaining to the unique value in the first column. I'm looking to find the number of unique pairs of values from columns 2-inf. for each row. The data looks like the dataframe below but on a larger scale. For example, "w" and "x" have appeared in the same row 4 times (rows 1, 3, 5, 7), "y" and "z" have appeared together in the same row twice (1, 6). I would like to count the number of unique pairs for all rows.

      x$`Order ID` 1    2    3    4    5
    1            1 w    x    y    z <NA>
    2            2 x    y <NA> <NA> <NA>
    3            3 u    v    w    x    y
    4            4 w <NA> <NA> <NA> <NA>
    5            5 w    x    y <NA> <NA>
    6            6 y    z <NA> <NA> <NA>
    7            7 t    u    v    w    x

I have tried to start by using setDT() from the data.table library to count the number of unique rows in general, and would then break it down to pairs of unique values within the rows, but get an error that the lengths of the rows are not equal ( because they are not). I am not well versed in R but an wondering what function can be used to do so. Thank you.

EDIT: I also tried the count() function from the plyr library. This is on the right track, but only counts the entire unique rows, not the individual unique pairs within the rows.

    library(plyr)
    count(transposed[, -1])

      X1   X2   X3   X4   X5 freq
    1  t    u    v    w    x    1
    2  u    v    w    x    y    1
    3  w    x    y    z <NA>    1
    4  w    x    y <NA> <NA>    1
    5  w <NA> <NA> <NA> <NA>    1
    6  x    y <NA> <NA> <NA>    1
    7  y    z <NA> <NA> <NA>    1

EDIT 2:

Desired output from this subset of data:

       Pair Frequency
    1    tu         1
    2    tv         1
    3    tw         1
    4    tx         1
    5    uv         2
    6    uw         2
    7    ux         2
    8    uy         1
    9    vw         2
    10   vx         2
    11   vy         1
    12   wx         4
    13   wy         3
    14   wz         1
    15   xy         4
    16   xz         1
    17   yz         2

Counting the number of different 2 pair combinations in each row, and then counting the number of times those combinations appear throughout the entire dataset. Order does not matter (xy = yx).

Upvotes: 0

Views: 938

Answers (2)

Obim
Obim

Reputation: 136

I think something like this might work for you.

library(dplyr)
# Go through each row and count combo occurrences
data.df$counts <- NA
for (i in 1:nrow(data.df)){
  values <- unlist(data.df[i,-c(1, nrow(data.df))]) %>% 
    as.character() %>% 
    subset(!(. %in% "<NA>")) %>%
    unique()
  # if there are enough unique values to make at least one combo, count it:
  if(length(values) > 1) {
    values <- combn(x = values, m = 2)
    data.df$counts[i] <- sapply(
      X = 1:ncol(values), 
      FUN = function(x){paste(as.character(values[,x]), collapse = "")}
    ) %>% length
  } else {
    data.df$counts[i] <- 0
  }
}

EDIT I see you want a tally for each pair instead:

# Make data.frame of all possible unique combos
combos <- unlist(data.df[,-1]) %>% 
  as.character() %>% 
  subset(!(. %in% c("<NA>", ""))) %>%
  unique() %>%
  as.data.frame(x = combn(x = ., m = 2))
combos.df <- data.frame(
  combo = sapply(
    X = 1:ncol(combos), 
    FUN = function(x){paste(as.character(combos[,x]), collapse = "")}), 
  freq = 0)

# Go through each row and count combo occurences
for (i in 1:nrow(data.df)){
  values <- unlist(data.df[i,-c(1, nrow(data.df))]) %>% 
    as.character() %>% 
    subset(!(. %in% "<NA>")) %>%
    unique()
  # if there are enough unique values to make at least one combo, count it:
  if(length(values) > 1) {
    values <- combn(x = values, m = 2)
    values <- sapply(
      X = 1:ncol(values), 
      FUN = function(x){paste(as.character(values[,x]), collapse = "")})
    for(j in 1:nrow(combos.df)){
      if(combos.df$combo[j] %in% values){
        combos.df$freq[j] <- as.numeric(as.character(combos.df$freq[j])) + 1
      }
    }
  }
}
> combos.df
   combo freq
1     wx    4
2     wu    0
3     wy    3
4     wt    0
5     wv    0
6     wz    1
7     xu    0
8     xy    4
9     xt    0
10    xv    0
11    xz    1
12    uy    1
13    ut    0
14    uv    2
15    uz    0
16    yt    0
17    yv    0
18    yz    2
19    tv    1
20    tz    0
21    vz    0

Upvotes: 0

r2evans
r2evans

Reputation: 160407

Try this,

combs <- t(combn(sort(na.omit(unique(unlist(dat[,-1])))), 2))
head(combs, n=7)
#      [,1] [,2]
# [1,] "t"  "u" 
# [2,] "t"  "v" 
# [3,] "t"  "w" 
# [4,] "t"  "x" 
# [5,] "t"  "y" 
# [6,] "t"  "z" 
# [7,] "u"  "v" 
freqs <- apply(combs, 1, function(C) {
  sum(apply(dat[,-1], 1, function(a) all(C %in% a, na.rm = TRUE)))
})
combsDF <- as.data.frame(combs)
combsDF$freq <- freqs
combsDF
#    V1 V2 freq
# 1   t  u    1
# 2   t  v    1
# 3   t  w    1
# 4   t  x    1
# 5   t  y    0
# 6   t  z    0
# 7   u  v    2
# 8   u  w    2
# 9   u  x    2
# 10  u  y    1
# 11  u  z    0
# 12  v  w    2
# 13  v  x    2
# 14  v  y    1
# 15  v  z    0
# 16  w  x    4
# 17  w  y    3
# 18  w  z    1
# 19  x  y    4
# 20  x  z    1
# 21  y  z    2

If you only want rows with positive matches, then

combsDF[ combsDF$freq > 0, ]
#    V1 V2 freq
# 1   t  u    1
# 2   t  v    1
# 3   t  w    1
# 4   t  x    1
# 7   u  v    2
# 8   u  w    2
# 9   u  x    2
# 10  u  y    1
# 12  v  w    2
# 13  v  x    2
# 14  v  y    1
# 16  w  x    4
# 17  w  y    3
# 18  w  z    1
# 19  x  y    4
# 20  x  z    1
# 21  y  z    2

This could likely easily be extended to include more than just "2" combinations.

Upvotes: 1

Related Questions