Reputation: 53
Here is some sample data:
df <- data.frame(ID = c(1, 2, 2),
Amount = c(100, 10, 100),
Time = c(0, 0, 24),
ADDL = c(5, 0, 1),
II = c(24, 24, 24))
Using this dataframe I would like to generate the following rows/dataframe, where ADDL is the number of additional rows that should be generated for a given ID, and the II term is the interval in time that each new row should take.
# ID Amount Time
# 1 1 100 0
# 2 1 100 24
# 3 1 100 48
# 4 1 100 72
# 5 1 100 96
# 6 1 100 120
# 7 2 10 0
# 8 2 100 24
# 9 2 100 48
The output dataframe that I would generate will have >350000 rows so an efficient solution is needed.
Upvotes: 2
Views: 239
Reputation: 53
#fastest option stays away from adding rows using a loop
#add rows function
addRows <- function(x){
Times <- seq(from = x$Time, to = x$Time + x$ADDL*x$II, by = x$II)
n <- length(Times)
x %>%
dplyr::slice(rep(row_number(),n)) %>%
dplyr::mutate(Time = Times)
}
df %>%
dplyr::group_by(USUBJID, TIME, ADDL) %>%
dplyr::group_split() %>%
purrr::map_dfr( ~ addRows(.))
Upvotes: 1
Reputation: 21287
Solution 3: use bind_rows
as shown below. This is faster than the other two solutions.
#### solution 3
df3 <- setDT(df)
mydf3 <- setDT(df)
m1 <- proc.time()
datalist = list()
for (i in 1:n) {
dlist = list()
if ( df3$ADDL[i]>0){
for(j in 1:df3$ADDL[i]){
dat <- data.table(ID=df3$ID[i],
Amount=df3$Amount[i],
Time=df3$Time[i]+j*df3$II[i],
ADDL=df3$ADDL[i],
II=df3$II[i])
dlist[[j]] <- dat # add it to your list
}
datalist[[i]] <- dlist
}
}
datalist[[n+1]] <- mydf3
all_data <- dplyr::bind_rows(datalist)
all_data[order(ID,Time)]
m2 <- proc.time()
print((m2 - m1)[[3]])
> all_data[order(ID,Time)]
ID Amount Time ADDL II
1: 1 100 0 5 24
2: 1 100 24 5 24
3: 1 100 48 5 24
4: 1 100 72 5 24
5: 1 100 96 5 24
6: 1 100 120 5 24
7: 2 10 0 0 24
8: 2 100 24 1 24
9: 2 100 48 1 24
OLD ANSWER:
You can use lapply
to accomplish this. The following code gives the expected result on your df
. You can drop the variables you don't need from mydf
dataframe.
UPDATE: I have two solutions. For large amount of data, second solution is better as it uses rbindlist and data.table - both implemented in C and therefore, more efficient.
## Solution 1 in R should suffice for small data frames
mydf <- df
lapply(1:n, function(i){
if (df$ADDL[i]>0){
lapply(1:df$ADDL[i], function(j){
mydf<<- rbind(mydf,data.frame(ID=df$ID[i],
Amount=df$Amount[i],
Time=df$Time[i]+j*df$II[i],
ADDL=df$ADDL[i],
II=df$II[i]))
})
}
})
mydf[order(mydf$ID,mydf$Time),]
## Solution 2 is more efficient for large data - uses data.table and rbindlist
mydf2 <- setDT(df)
lapply(1:n, function(i){
if (df$ADDL[i]>0){
lapply(1:df$ADDL[i], function(j){
addrow <- data.table(ID=df$ID[i],
Amount=df$Amount[i],
Time=df$Time[i]+j*df$II[i],
ADDL=df$ADDL[i],
II=df$II[i])
mydf2<<- rbindlist(list(mydf2,addrow), use.names=TRUE)
})
}
})
mydf2[order(ID,Time)]
identical(mydf2, setDT(mydf))
> identical(mydf2, setDT(mydf))
[1] TRUE
Upvotes: 2