Erik Hahn
Erik Hahn

Reputation: 53

How to create additional rows or a dataframe based on column values

Here is some sample data:

df <- data.frame(ID = c(1, 2, 2), 
                 Amount = c(100, 10, 100),
                 Time = c(0, 0, 24),
                 ADDL = c(5, 0, 1),
                 II = c(24, 24, 24))

Using this dataframe I would like to generate the following rows/dataframe, where ADDL is the number of additional rows that should be generated for a given ID, and the II term is the interval in time that each new row should take.

#      ID Amount Time 
# 1    1  100    0
# 2    1  100    24
# 3    1  100    48
# 4    1  100    72
# 5    1  100    96
# 6    1  100    120
# 7    2  10     0
# 8    2  100    24
# 9    2  100    48

The output dataframe that I would generate will have >350000 rows so an efficient solution is needed.

Upvotes: 2

Views: 239

Answers (2)

Erik Hahn
Erik Hahn

Reputation: 53

#fastest option stays away from adding rows using a loop

#add rows function
addRows <- function(x){
  Times <- seq(from = x$Time, to = x$Time + x$ADDL*x$II, by = x$II)
  n <- length(Times)
  x %>% 
    dplyr::slice(rep(row_number(),n)) %>%
    dplyr::mutate(Time = Times)
}

df %>%
  dplyr::group_by(USUBJID, TIME, ADDL) %>%
  dplyr::group_split() %>%
  purrr::map_dfr( ~ addRows(.))

Upvotes: 1

YBS
YBS

Reputation: 21287

Solution 3: use bind_rows as shown below. This is faster than the other two solutions.

####  solution 3
  df3 <- setDT(df)
  mydf3 <- setDT(df)
  m1 <- proc.time()
  datalist = list()
  
  for (i in 1:n) {
    dlist = list()
    if ( df3$ADDL[i]>0){
      for(j in 1:df3$ADDL[i]){
        dat <- data.table(ID=df3$ID[i],
                          Amount=df3$Amount[i],
                          Time=df3$Time[i]+j*df3$II[i],
                          ADDL=df3$ADDL[i],
                          II=df3$II[i])
        dlist[[j]] <- dat # add it to your list
      }
      datalist[[i]] <- dlist
    }
    
  }
  datalist[[n+1]] <- mydf3
  all_data <- dplyr::bind_rows(datalist)
  all_data[order(ID,Time)]
  m2 <- proc.time()
  print((m2 - m1)[[3]])

>   all_data[order(ID,Time)]
   ID Amount Time ADDL II
1:  1    100    0    5 24
2:  1    100   24    5 24
3:  1    100   48    5 24
4:  1    100   72    5 24
5:  1    100   96    5 24
6:  1    100  120    5 24
7:  2     10    0    0 24
8:  2    100   24    1 24
9:  2    100   48    1 24

OLD ANSWER: You can use lapply to accomplish this. The following code gives the expected result on your df. You can drop the variables you don't need from mydf dataframe. UPDATE: I have two solutions. For large amount of data, second solution is better as it uses rbindlist and data.table - both implemented in C and therefore, more efficient.

##  Solution 1 in R should suffice for small data frames

mydf <- df
  lapply(1:n, function(i){
    if (df$ADDL[i]>0){
      lapply(1:df$ADDL[i], function(j){
        mydf<<- rbind(mydf,data.frame(ID=df$ID[i],
                                      Amount=df$Amount[i],
                                      Time=df$Time[i]+j*df$II[i],
                                      ADDL=df$ADDL[i],
                                      II=df$II[i]))
      })
    }
  })
  
  mydf[order(mydf$ID,mydf$Time),]
  
  ##  Solution 2 is more efficient for large data - uses data.table and rbindlist
  
  mydf2 <- setDT(df)
  lapply(1:n, function(i){
    if (df$ADDL[i]>0){
      lapply(1:df$ADDL[i], function(j){
        addrow <- data.table(ID=df$ID[i],
                             Amount=df$Amount[i],
                             Time=df$Time[i]+j*df$II[i],
                             ADDL=df$ADDL[i],
                             II=df$II[i])
        mydf2<<- rbindlist(list(mydf2,addrow), use.names=TRUE)
      })
    }
  })
  
  mydf2[order(ID,Time)]
  
  identical(mydf2, setDT(mydf)) 

> identical(mydf2, setDT(mydf))
[1] TRUE  

Upvotes: 2

Related Questions