user_n
user_n

Reputation: 53

Deleting groups that don't appear in every time period and data frame

I am cleaning data with multiple time periods in each data frame with multiple data frames. Each data frame has one year of data. I want to delete groups that do not appear in each time period(within the data frame), and delete groups that do not appear in each data frame. In other words, I want to keep groups that exist in every time period, across each data frame. I created data with an ID, time variable, and two variables that represent my data. My data will have more data frames, IDs, groups and variables as well.

t<-c(1,1,2,2,3,3,3,4,4,4)
id<-c(200,300,200,300,100,200,300,200,300,400)
x1<-rnorm(1:10)
x2<-rnorm(1:10)
?df
df<-data.frame(id,t,x1,x2)
t<-c(1,1,1,2,2,3,3,3,4,4)
id<-c(200,300,400,200,300,200,300,400,200,300)
x1<-rnorm(1:10)
x2<-rnorm(1:10)
df2<-data.frame(id,t,x1,x2)
id<-c(200,300,200,300,600,200,300,100,200,300)
t<-c(1,1,2,2,2,3,3,4,4,4)
x1<-rnorm(1:10)
x2<-rnorm(1:10)
df3<-data.frame(id,t,x1,x2)
rb<-rbind(df,df2,df3)
rb
cb<-cbind(df,df2,df3)
cb
    id t          x1          x2  id t            x1         x2  id t          x1            x2
1  200 1  0.37223136 -0.04918183 200 1  0.6489171399 -0.1324335 200 1 -0.41387676 -0.4566678425
2  300 1 -0.22062416  0.05150952 300 1 -0.3669090613  3.0826144 300 1  0.48237987 -0.0325861333
3  200 2  0.32912208  1.03922999 400 1  0.9347859735  0.1026632 200 2 -0.31308242 -0.3021501845
4  300 2 -0.18172302 -1.41669927 200 2  0.4814364147 -0.1087465 300 2 -1.52273626  0.6357750776
5  100 3 -0.81072008  0.64522238 300 2 -0.5676866296  0.2371230 600 2 -0.09687669  2.2883585934
6  200 3  0.45175343  0.64197622 200 3  0.0006852893  0.5830704 200 3  0.01726120 -0.5905109745
7  300 3  0.40465989 -0.70796588 300 3 -0.0008717189 -1.1969493 300 3 -0.18603781  0.3722390396
8  200 4  0.09852108 -1.76958443 400 3  0.9343534507 -1.3671447 100 4 -0.57308316  0.4749221706
9  300 4 -0.53951022  0.97306346 200 4  1.9176422485  0.9879788 200 4  0.40222133  0.3278821640
10 400 4  0.24271562 -1.37269617 300 4  1.4298971045  1.6095265 300 4  0.85799186  0.0006593401

My final out put would look like this:

id  t          x1          x2
200 1  0.37223136 -0.04918183 
300 1 -0.22062416  0.05150952
200 2  0.32912208  1.03922999
300 2 -0.18172302 -1.41669927
200 3  0.45175343  0.64197622
300 3  0.40465989 -0.70796588
200 4  0.09852108 -1.76958443
300 4 -0.53951022  0.97306346
200 1  0.6489171399 -0.1324335
300 1 -0.3669090613  3.0826144
200 2  0.4814364147 -0.1087465
300 2 -0.5676866296  0.2371230
200 3  0.0006852893  0.5830704
300 3 -0.0008717189 -1.1969493
200 4  1.9176422485  0.9879788
300 4  1.4298971045  1.6095265
200 1 -0.41387676 -0.4566678425
300 1  0.48237987 -0.0325861333
200 2 -0.31308242 -0.3021501845
300 2 -1.52273626  0.6357750776
200 3  0.01726120 -0.5905109745
300 3 -0.18603781  0.3722390396
200 4  0.40222133  0.3278821640
300 4  0.85799186  0.0006593401

Upvotes: 0

Views: 168

Answers (2)

nacnudus
nacnudus

Reputation: 6528

One strategy is to count the number of times each combination of id and t appears. If this is equals the maximum possible, then keep that id. (I used max to get the maximum possible combinations, but that only works if at least one id has appeared in every t.

I use adply from the plyr package here to replace your rbind step, because adply preserves information about which data frame each row came from (in the X1 column).

library(plyr)
rb <- adply(list(df, df2, df3), 1)

unique_combo <- unique(rb[, c("X1", "id", "t")])
##    X1  id t
## 1   1 200 1
## 2   1 300 1
## 3   1 200 2
## 4   1 300 2
## 5   1 100 3
## 6   1 200 3
## 7   1 300 3
## 8   1 200 4
## 9   1 300 4
## 10  1 400 4
## 11  2 200 1
## 12  2 300 1 etc.

combos_per_id <- aggregate(t ~ id, FUN = length, data = unique_combo)
##    id  t
## 1 100  2
## 2 200 12
## 3 300 12
## 4 400  3
## 5 600  1

ids_you_want <- subset(combos_per_id, t == max(t))
##    id  t
## 2 200 12
## 3 300 12

rb[rb$id %in% ids_you_want$id, ]
##    X1  id t          x1           x2
## 1   1 200 1  0.41800060 -0.729280896
## 2   1 300 1 -1.26310444  0.649438361
## 3   1 200 2  1.75130801  0.340464369
## 4   1 300 2 -0.47751518 -1.396611139
## 6   1 200 3 -0.11537438 -1.483654622
## 7   1 300 3 -1.33689508 -1.219725112 etc.

Edit to handle another column

library(plyr)
t<-c(1,1,2,2,3,3,3,4,4,4)
id<-c(200,300,200,300,100,200,300,200,300,400)
x1<-rnorm(1:10)
x2<-rnorm(1:10)
r<-c("b","a","a","a","a","a","a","a","a","a")
df<-data.frame(id,t,x1,x2, r)

t<-c(1,1,1,2,2,3,3,3,4,4)
id<-c(200,300,400,200,300,200,300,400,200,300)
x1<-rnorm(1:10)
x2<-rnorm(1:10)
r<-c("a","a","a","a","a","a","a","a","a","a")
df2<-data.frame(id,t,x1,x2, r)

id<-c(200,300,200,300,600,200,300,100,200,300)
t<-c(1,1,2,2,2,3,3,4,4,4)
x1<-rnorm(1:10)
x2<-rnorm(1:10)
r<-c("a","a","a","a","a","a","a","a","a","a")
df3<-data.frame(id,t,x1,x2, r)

rb <- adply(list(df, df2, df3), 1)
unique_combo <- unique(rb[, c("X1", "id", "t", "r")])
(combos_per_id <- aggregate(t ~ id + r, FUN = length, data = unique_combo))
ids_you_want <- subset(combos_per_id, t == max(t))
rb[rb$id %in% ids_you_want$id, ]

Upvotes: 1

Jthorpe
Jthorpe

Reputation: 10194

This is a bit brute force, but should work:

frames <- list(df,df2,df3)

lu <- function(x)
    length(unique(x))

timePeriodsPerDataframe <- sapply(frames,function(x)lu(x))

for(i in seq(length(frames))){
    appearsInEveryTimePeriod <- tapply(frames[[i]]$id,
                                       frames[[i]]$t,
                                       lu) == timePeriodsPerDataframe[i]
    if(i == 1)
        IDsInEveryTimePeriod <- names(tmp[tmp])
    else
        IDsInEveryTimePeriod <- intersect(names(tmp[tmp]),IDsInEveryTimePeriod)
}
IDsInEveryTimePeriod <- as.numeric(IDsInEveryTimePeriod)

Upvotes: 0

Related Questions