Alex
Alex

Reputation: 1304

Identifying missing observations in groups

I have some difficulties with my code, and I hope some of you could help.

The dataset looks something like this:

df <- data.frame("group" = c("A", "A", "A","A_1", "A_1", "B","B","B_1"), 
                 "id" = c("id1", "id2", "id3", "id2", "id3", "id5","id1","id1"), 
                 "time" = c(1,1,1,3,3,2,2,5),
                 "Val" = c(10,10,10,10,10,12,12,12))

"group" indicate the group the individual "id" is in. "A_1" indicate that a subject has left the group.

For instance, one subject "id1" leaves the "group A" that becomes group "A_1", where only "id2" and "id3" are members. Similarly "id5" leaves group B that becomes "B_1" with only id1 as a member.

What I would like to have in the final dataset is an opposite type of groups identification, that should look something like this:

final <- data.frame("group" = c("A", "A", "A","A_1", "B","B","B_1"), 
                     "id" = c("id1", "id2", "id3", "id1", "id5","id1","id5"), 
                     "time" = c(1,1,1,3,2,2,5),
                     "Val" = c(10,10,10,10,12,12,12),
                     "groupid" = c("A", "A", "A","A", "B","B","B"))

Whereby "A_1" and "B_1" only indicate the subjects, "id1" and "id5" respectively, that have left the original group, rather than identifying remaining subjects.

Does anyone have suggestions on how I could systematically do this?

I thank you in advance for your help.


Follow up:

My data is a little more complex that in the above example as there are multiple "exits" from treatements, moreover group identifier can be of different character leghts (here for instance AAA and B). The data looks more like the following:

df2 <- data.frame("group" = c("AAA", "AAA", "AAA","AAA","AAA_1","AAA_1", "AAA_1","AAA_2","AAA_2","B","B","B_1"), 
                  "id" = c("id1", "id2", "id3","id4", "id2", "id3","id4", "id2","id3", "id5","id1","id1"), 
                  "time" = c(1,1,1,1,3,3,3,6,6,2,2,5),
                  "Val" = c(10,10,10,10,10,10,10,10,10,12,12,12))

Where at time 3 id1 leaves groups AAA, that becomes groups AAA_1, while at time 6, also id4 leaves group AAA, that becomes group AAA_2. As discussed previously, i would like groups with "_" to identify those id that left the group rather than the one remaining. Hence the final dataset should look something like this:

final2 <- data.frame("group" = c("A", "A", "A","A","A_1","A_2",                     
                              "B","B","B_1"), 
                  "id" = c("id1", "id2", "id3","id4", "id1", "id4", "id5","id1","id5"), 
                  "time" = c(1,1,1,1,3,6,2,2,5),
                  "Val" = c(10,10,10,10,10,10,12,12,12))

thanks for helping me with this

Upvotes: 0

Views: 115

Answers (1)

s__
s__

Reputation: 9525

Ok you can try with dplyr in this way: maybe it's not elegant, but you get the result. The idea behind is to first fetch the ones that are in group ... but not in the relative ..._1 and change their group, fetch the others, and rbind them together:

library(dplyr)
# first you could find the one that are missing in the ..._1 groups
# and change their group to ..._1
    dups <-
           df %>%
           group_by(id, groupid = substr(group,1,1)) %>%
           filter(n() == 1)%>%
           mutate(group = paste0(group,'_1')) %>%
           left_join(df %>%
                     select(group, time, Val) %>%
                     distinct(), by ='group') %>% 
           select(group, id, time = time.y, Val = Val.y) %>%
           ungroup()

dups
# A tibble: 2 x 5
  groupid group id     time   Val
  <chr>   <chr> <fct> <dbl> <dbl>
1 A       A_1   id1       3    10
2 B       B_1   id5       5    12

# now you can select the ones that are in both groups:
dups2 <-
        df %>%
        filter(nchar(as.character(group)) == 1) %>%
        mutate(groupid = substr(group,1,1))

   dups2
  group  id time Val groupid
1     A id1    1  10       A
2     A id2    1  10       A
3     A id3    1  10       A
4     B id5    2  12       B
5     B id1    2  12       B

Last, rbind() them, arrange() them and order() the columns:

rbind(dups, dups2) %>%
 arrange(group) %>%
 select(group, id, time, Val, groupid)

# A tibble: 7 x 5
  group id     time   Val groupid
  <chr> <fct> <dbl> <dbl> <chr>  
1 A     id1       1    10 A      
2 A     id2       1    10 A      
3 A     id3       1    10 A      
4 A_1   id1       3    10 A      
5 B     id5       2    12 B      
6 B     id1       2    12 B      
7 B_1   id5       5    12 B 

Hope it helps!


EDIT:

You can generalize it with some work, here my attempt, hope it helps:

library(dplyr)
df3 <- df2

# you have to set a couple of fields you need:
df3$group <-ifelse(
  substr(df2$group,(nchar(as.character(df2$group))+1)-1,nchar(as.character(df2$group))) %in% c(0:9),
  paste0(substr(df2$group,1,1),"_",substr(df2$group,(nchar(as.character(df2$group))+1)-1,nchar(as.character(df2$group)))),
  paste0(substr(df2$group,1,1),"_0")
  )

df3$util <- as.numeric(substr(df3$group,3,3))+1

# two empty lists to populate with a nested loop:
changed <- list()
final_changed <- list()

Now first we find who changes, then the other: the idea is the same of the previous part:

for (j in c("A","B")) {
 df3_ <- df3[substr(df3$group,1,1)==j,] 
 for (i in unique(df3_$util)[1:length(unique(df3_$util))-1]) {
       temp1 <- df3_[df3_$util == i,]
       temp2 <- df3_[df3_$util == i+1,]
       changes <- temp1[!temp1$id %in% temp2$id,]
       changes$group <- paste0(j,'_',i )
       changes <- changes %>% left_join(temp2, by = 'group') %>% 
                  select(group , id = id.x, time = time.y, Val = Val.y)

     changed[[i]] <- changes
     }
  final_changed[[j]] <- changed
  }

change <- do.call(rbind,(do.call(Map, c(f = rbind, final_changed)))) %>% distinct()
change
  group  id time Val
1   A_1 id1    3  10
2   B_1 id5    5  12
3   A_2 id4    6  10

Then the remains, and put together:

remain <-
  df3 %>% mutate(group = gsub("_0", "", .$group)) %>%
  filter(nchar(as.character(group)) == 1) %>% select(-util)


rbind(change, remain) %>%
  mutate(groupid = substr(group,1,1)) %>% arrange(group) %>%
  select(group, id, time, Val, groupid)

  group  id time Val groupid
1     A id1    1  10       A
2     A id2    1  10       A
3     A id3    1  10       A
4     A id4    1  10       A
5   A_1 id1    3  10       A
6   A_2 id4    6  10       A
7     B id5    2  12       B
8     B id1    2  12       B
9   B_1 id5    5  12       B

Upvotes: 2

Related Questions