chippycentra
chippycentra

Reputation: 3432

Complex merging between rows when sharing values with other groups in R

Hello everyone I have a dataframe such as ::

  Families Species   Event      Groups
1  A,B,C,D     SP1 4,5,6,1 G1,G2,G3,G4
2    A,A,C     SP2   4,4,2       G1,G4
3        B     SP2       5          G1
4        A     SP3       2          G8
5        C     SP3       1          G9

and the idea is that I would like to merge rows by Species when :

So for instance we see for SP2 row2 that :

AND

for SP2 row3 that :

then I can merge the rows and row3 since they both share values with the row1 of SP1.

so here is the expected output:

  Families Species   Event      Groups
1  A,B,C,D     SP1 4,5,6,1 G1,G2,G3,G4
2  A,A,C,B SP2,SP2 4,4,2,5    G1,G4,G1
3        A     SP3       2          G8
4        C     SP3       1          G9

If someone has an idea it would be amazing, thank you very much for your help and time.

Here are the data :

    structure(list(Families = structure(c(3L, 2L, 4L, 1L, 5L), .Label = c("A", 
    "A,A,C", "A,B,C,D", "B", "C"), class = "factor"), Species = structure(c(1L, 
    2L, 2L, 3L, 3L), .Label = c("SP1", "SP2", "SP3"), class = "factor"), 
        Event = structure(c(4L, 3L, 5L, 2L, 1L), .Label = c("1", 
        "2", "4,4,2", "4,5,6,1", "5"), class = "factor"), Groups = structure(c(2L, 
        3L, 1L, 4L, 5L), .Label = c("G1", "G1,G2,G3,G4", "G1,G4", 
        "G8", "G9"), class = "factor")), class = "data.frame", row.names = c(NA, 
    -5L))

Upvotes: 1

Views: 78

Answers (1)

MrSmithGoesToWashington
MrSmithGoesToWashington

Reputation: 1076

here is an attempt, using data.table (but if you look a it, it's pretty possbile to do it without data.table, it's just that data.table is now my mother tongue) :

dt <-  structure(list(Families = structure(c(3L, 2L, 4L, 1L, 5L), .Label = c("A", 
                                                                             "A,A,C", "A,B,C,D", "B", "C"), class = "factor"), Species = structure(c(1L, 
                                                                                                                                                     2L, 2L, 3L, 3L), .Label = c("SP1", "SP2", "SP3"), class = "factor"), 
                      Event = structure(c(4L, 3L, 5L, 2L, 1L), .Label = c("1", 
                                                                          "2", "4,4,2", "4,5,6,1", "5"), class = "factor"), Groups = structure(c(2L, 
                                                                                                                                                 3L, 1L, 4L, 5L), .Label = c("G1", "G1,G2,G3,G4", "G1,G4", 
                                                                                                                                                                             "G8", "G9"), class = "factor")), class = "data.frame", row.names = c(NA, 
                                                                                                                                                                                                                                                  -5L))
library(data.table)                                                                                                                                                                                                                                           
setDT(dt)
library(stringr)

dt[, Families := as.character(Families)]
dt[, Event := as.character(Event)]
dt[, Groups := as.character(Groups)]
dt[, n := row.names(.SD) ]

dt2 <- merge(dt, 
      dt, 
      by = "Species")[n.x >= n.y]

dt2[, testFamilies := length(intersect(unlist(strsplit(Families.x, ",")), unlist(strsplit(Families.y, ",")))) > 0, by = 1:nrow(dt2)]
dt2[, testEvents := length(intersect(unlist(strsplit(Event.x, ",")), unlist(strsplit(Event.y, ",")))) > 0, by = 1:nrow(dt2)]
dt2[, testGroups := length(intersect(unlist(strsplit(Groups.x, ",")), unlist(strsplit(Groups.y, ",")))) > 0, by = 1:nrow(dt2)]

dt2[, Families := paste0(unique(unlist(strsplit(paste(Families.x, Families.y, sep = ","), ","))), collapse = ","), by = 1:nrow(dt2)]
dt2[, Event := paste0(unique(unlist(strsplit(paste(Event.x, Event.y, sep = ","), ","))), collapse = ","), by = 1:nrow(dt2)]
dt2[, Groups := paste0(unique(unlist(strsplit(paste(Groups.x, Groups.y, sep = ","), ","))), collapse = ","), by = 1:nrow(dt2)]

dt2[(testFamilies == TRUE | testGroups == TRUE | testEvents == TRUE)
    & !n.x %in% dt2[(testFamilies == TRUE | testGroups == TRUE | testEvents == TRUE) & n.x != n.y, n.y]
    & !n.y %in% dt2[(testFamilies == TRUE | testGroups == TRUE | testEvents == TRUE) & n.x != n.y, n.x], 
                     .(Species, Families, Event, Groups)]



Upvotes: 1

Related Questions