Reputation: 53
I am cleaning data with multiple time periods in each data frame with multiple data frames. Each data frame has one year of data. I want to delete groups that do not appear in each time period(within the data frame), and delete groups that do not appear in each data frame. In other words, I want to keep groups that exist in every time period, across each data frame. I created data with an ID, time variable, and two variables that represent my data. My data will have more data frames, IDs, groups and variables as well.
t<-c(1,1,2,2,3,3,3,4,4,4)
id<-c(200,300,200,300,100,200,300,200,300,400)
x1<-rnorm(1:10)
x2<-rnorm(1:10)
?df
df<-data.frame(id,t,x1,x2)
t<-c(1,1,1,2,2,3,3,3,4,4)
id<-c(200,300,400,200,300,200,300,400,200,300)
x1<-rnorm(1:10)
x2<-rnorm(1:10)
df2<-data.frame(id,t,x1,x2)
id<-c(200,300,200,300,600,200,300,100,200,300)
t<-c(1,1,2,2,2,3,3,4,4,4)
x1<-rnorm(1:10)
x2<-rnorm(1:10)
df3<-data.frame(id,t,x1,x2)
rb<-rbind(df,df2,df3)
rb
cb<-cbind(df,df2,df3)
cb
id t x1 x2 id t x1 x2 id t x1 x2
1 200 1 0.37223136 -0.04918183 200 1 0.6489171399 -0.1324335 200 1 -0.41387676 -0.4566678425
2 300 1 -0.22062416 0.05150952 300 1 -0.3669090613 3.0826144 300 1 0.48237987 -0.0325861333
3 200 2 0.32912208 1.03922999 400 1 0.9347859735 0.1026632 200 2 -0.31308242 -0.3021501845
4 300 2 -0.18172302 -1.41669927 200 2 0.4814364147 -0.1087465 300 2 -1.52273626 0.6357750776
5 100 3 -0.81072008 0.64522238 300 2 -0.5676866296 0.2371230 600 2 -0.09687669 2.2883585934
6 200 3 0.45175343 0.64197622 200 3 0.0006852893 0.5830704 200 3 0.01726120 -0.5905109745
7 300 3 0.40465989 -0.70796588 300 3 -0.0008717189 -1.1969493 300 3 -0.18603781 0.3722390396
8 200 4 0.09852108 -1.76958443 400 3 0.9343534507 -1.3671447 100 4 -0.57308316 0.4749221706
9 300 4 -0.53951022 0.97306346 200 4 1.9176422485 0.9879788 200 4 0.40222133 0.3278821640
10 400 4 0.24271562 -1.37269617 300 4 1.4298971045 1.6095265 300 4 0.85799186 0.0006593401
My final out put would look like this:
id t x1 x2
200 1 0.37223136 -0.04918183
300 1 -0.22062416 0.05150952
200 2 0.32912208 1.03922999
300 2 -0.18172302 -1.41669927
200 3 0.45175343 0.64197622
300 3 0.40465989 -0.70796588
200 4 0.09852108 -1.76958443
300 4 -0.53951022 0.97306346
200 1 0.6489171399 -0.1324335
300 1 -0.3669090613 3.0826144
200 2 0.4814364147 -0.1087465
300 2 -0.5676866296 0.2371230
200 3 0.0006852893 0.5830704
300 3 -0.0008717189 -1.1969493
200 4 1.9176422485 0.9879788
300 4 1.4298971045 1.6095265
200 1 -0.41387676 -0.4566678425
300 1 0.48237987 -0.0325861333
200 2 -0.31308242 -0.3021501845
300 2 -1.52273626 0.6357750776
200 3 0.01726120 -0.5905109745
300 3 -0.18603781 0.3722390396
200 4 0.40222133 0.3278821640
300 4 0.85799186 0.0006593401
Upvotes: 0
Views: 168
Reputation: 6528
One strategy is to count the number of times each combination of id
and t
appears. If this is equals the maximum possible, then keep that id. (I used max
to get the maximum possible combinations, but that only works if at least one id
has appeared in every t
.
I use adply
from the plyr
package here to replace your rbind
step, because adply
preserves information about which data frame each row came from (in the X1
column).
library(plyr)
rb <- adply(list(df, df2, df3), 1)
unique_combo <- unique(rb[, c("X1", "id", "t")])
## X1 id t
## 1 1 200 1
## 2 1 300 1
## 3 1 200 2
## 4 1 300 2
## 5 1 100 3
## 6 1 200 3
## 7 1 300 3
## 8 1 200 4
## 9 1 300 4
## 10 1 400 4
## 11 2 200 1
## 12 2 300 1 etc.
combos_per_id <- aggregate(t ~ id, FUN = length, data = unique_combo)
## id t
## 1 100 2
## 2 200 12
## 3 300 12
## 4 400 3
## 5 600 1
ids_you_want <- subset(combos_per_id, t == max(t))
## id t
## 2 200 12
## 3 300 12
rb[rb$id %in% ids_you_want$id, ]
## X1 id t x1 x2
## 1 1 200 1 0.41800060 -0.729280896
## 2 1 300 1 -1.26310444 0.649438361
## 3 1 200 2 1.75130801 0.340464369
## 4 1 300 2 -0.47751518 -1.396611139
## 6 1 200 3 -0.11537438 -1.483654622
## 7 1 300 3 -1.33689508 -1.219725112 etc.
Edit to handle another column
library(plyr)
t<-c(1,1,2,2,3,3,3,4,4,4)
id<-c(200,300,200,300,100,200,300,200,300,400)
x1<-rnorm(1:10)
x2<-rnorm(1:10)
r<-c("b","a","a","a","a","a","a","a","a","a")
df<-data.frame(id,t,x1,x2, r)
t<-c(1,1,1,2,2,3,3,3,4,4)
id<-c(200,300,400,200,300,200,300,400,200,300)
x1<-rnorm(1:10)
x2<-rnorm(1:10)
r<-c("a","a","a","a","a","a","a","a","a","a")
df2<-data.frame(id,t,x1,x2, r)
id<-c(200,300,200,300,600,200,300,100,200,300)
t<-c(1,1,2,2,2,3,3,4,4,4)
x1<-rnorm(1:10)
x2<-rnorm(1:10)
r<-c("a","a","a","a","a","a","a","a","a","a")
df3<-data.frame(id,t,x1,x2, r)
rb <- adply(list(df, df2, df3), 1)
unique_combo <- unique(rb[, c("X1", "id", "t", "r")])
(combos_per_id <- aggregate(t ~ id + r, FUN = length, data = unique_combo))
ids_you_want <- subset(combos_per_id, t == max(t))
rb[rb$id %in% ids_you_want$id, ]
Upvotes: 1
Reputation: 10194
This is a bit brute force, but should work:
frames <- list(df,df2,df3)
lu <- function(x)
length(unique(x))
timePeriodsPerDataframe <- sapply(frames,function(x)lu(x))
for(i in seq(length(frames))){
appearsInEveryTimePeriod <- tapply(frames[[i]]$id,
frames[[i]]$t,
lu) == timePeriodsPerDataframe[i]
if(i == 1)
IDsInEveryTimePeriod <- names(tmp[tmp])
else
IDsInEveryTimePeriod <- intersect(names(tmp[tmp]),IDsInEveryTimePeriod)
}
IDsInEveryTimePeriod <- as.numeric(IDsInEveryTimePeriod)
Upvotes: 0