Seymoo
Seymoo

Reputation: 189

find strings with similar characters in group setting in R

> dput(mydf)
structure(list(pID = structure(c(69L, 69L, 69L, 69L, 69L, 69L, 
69L, 69L, 69L, 73L, 73L, 73L, 73L), .Label = c("S001", "S002", 
"S003", "S004", "S005", "S006", "S007", "S009", "S012", "S013", 
"S014", "S015", "S016", "S017", "S020", "S021", "S022", "S025", 
"S027", "S028", "S029", "S030", "S032", "S035", "S036", "S038", 
"S039", "S040", "S041", "S042", "S043", "S044", "S045", "S047", 
"S048", "S049", "S050", "S051", "S052", "S053", "S056", "S057", 
"S058", "S059", "S060", "S061", "S062", "S063", "S064", "S065", 
"S066", "S067", "S069", "S070", "S071", "S073", "S075", "S076", 
"S077", "S078", "S079", "S080", "S081", "S082", "S083", "S084", 
"S087", "S088", "S089", "S090", "S091", "S093", "S095", "S097", 
"S099", "S100", "S101", "S103", "S104", "S105", "S106", "S107", 
"S109", "S110", "S112", "S113", "S114", "S115", "S116", "S117", 
"S118", "S119", "S121", "S123", "S124", "S125", "S127", "S128", 
"S129", "S130", "S133", "S134", "S135", "S136", "S138", "S139", 
"S141", "S142", "S143", "S144", "S145", "S146", "S149", "S150", 
"S151", "S152", "S153", "S154", "S155", "S156", "S157", "S161", 
"S163", "S164", "S166", "S168", "S170", "S171", "S172", "S176", 
"S177", "S179", "S180", "S182", "S183", "S188", "S189", "S190", 
"S191", "S192", "S195", "S197", "S200", "S201", "S202", "S204", 
"S211", "S214", "S217", "S218", "S220", "S222", "S224", "S229", 
"S231", "S234", "S235", "S238", "S246", "S250", "S251", "S254", 
"S327", "S333", "S338", "S441", "S467", "S486", "S503", "S523", 
"S532"), class = "factor"), tID = structure(c(9L, 13L, 14L, 18L, 
23L, 27L, 28L, 10L, 19L, 8L, 14L, 17L, 23L), .Label = c("", "3T1_1", 
"3T3_1", "3T3_2", "3T4_1", "3T4_2", "T", "T1", "T1_1", "T1_2", 
"T1_3", "T1_4", "T11", "T2", "T2_1", "T2_2", "T3", "T3_1", "T3_2", 
"T4", "T4_1", "T4_2", "T5", "T5_1", "T5_2", "T6", "T8", "T9"), class = "factor"), 
    sID = structure(c(25L, 25L, 25L, 25L, 25L, 25L, 25L, NA, 
    NA, 27L, 27L, NA, 27L), .Label = c("", "P1", "P10", "P11", 
    "P12", "P13", "P14", "P15", "P16", "P17", "P18", "P19", "P2", 
    "P20", "P21", "P22", "P23", "P24", "P25", "P26", "P27", "P28", 
    "P29", "P3", "P30", "P31", "P32", "P33", "P34", "P35", "P36", 
    "P37", "P38", "P39", "P4", "P40", "P41", "P42", "P43", "P44", 
    "P45", "P5", "P6", "P7", "P8", "P9"), class = "factor")), class = "data.frame", row.names = c(NA, 
-13L))

In the data frame above I would like to do two things:

  1. In the sID column, fill each NA with the value that the other rows of the same pID have. So all S089 rows will have P30 in sID.
  2. Make a new column: group by pID, then mark each row as "unique" if the part of its tID before the _ occurs only once within the group, and as "repeated" otherwise. So for S089, the rows T1_1, T1_2, T3_1 and T3_2 will be marked repeated and the rest of the rows will be marked unique.

Should be possible with dplyr, I guess. Thanks a lot!

Upvotes: 0

Views: 98

Answers (1)

Ric S
Ric S

Reputation: 9247

With the premises that

  • inside each pID group the sID is always the same
  • if a value in tID contains an underscore (_) then it's "repeated" (as this is what emerges from your sample data)

one possible solution using tidyr and dplyr is

library(tidyr)
library(dplyr)

# Fill in the missing sID values per participant and label each tID.
# The result is left grouped by pID.
mydf %>%
  # Group by participant so sID is only borrowed from rows sharing a pID.
  group_by(pID) %>%
  # Fill NA sID values downwards first, then upwards.
  fill(sID, .direction = "downup") %>%
  mutate(
    # A tID containing an underscore is labelled "repeated", any other "unique".
    new_col = if_else(grepl("_", tID, fixed = TRUE), "repeated", "unique")
  )

Output

# A tibble: 13 x 4
# Groups:   pID [2]
#    pID   tID   sID   new_col 
#    <fct> <fct> <fct> <chr>   
#  1 S089  T1_1  P30   repeated
#  2 S089  T11   P30   unique  
#  3 S089  T2    P30   unique  
#  4 S089  T3_1  P30   repeated
#  5 S089  T5    P30   unique  
#  6 S089  T8    P30   unique  
#  7 S089  T9    P30   unique  
#  8 S089  T1_2  P30   repeated
#  9 S089  T3_2  P30   repeated
# 10 S095  T1    P32   unique  
# 11 S095  T2    P32   unique  
# 12 S095  T3    P32   unique  
# 13 S095  T5    P32   unique

Upvotes: 1

Related Questions