jakes
jakes

Reputation: 2085

How to create sub data frames for each row with data before and after a row

I've got a data like below:

ex <- structure(list(timestamp = structure(c(1502480763.554, 1502480763.554, 
1502480764.968, 1502480765.554, 1502480768.554, 1502480770.554, 
1502480773.519, 1502480775.72, 1502480777.43, 1502480778.278, 
1502480778.288, 1502480778.759, 1502480780.472, 1502480782.815, 
1502480785.521, 1502480785.531, 1502480785.707, 1502480787.639, 
1502480789.1, 1502480790.682, 1502480791.554, 1502480793.322, 
1502480794.363, 1502480795.923, 1502480799.239, 1502480800.27, 
1502480800.554, 1502480802.554, 1502480805.63, 1502480805.959, 
1502480807.327, 1502480809.554, 1502480809.564, 1502480810.554, 
1502480812.8, 1502480813.838, 1502480813.848, 1502480816.24, 
1502480816.24, 1502480835.56, 1502480838.576, 1502480848.384, 
1502480851.859, 1502480853.554, 1502480856.375, 1502480857.688, 
1502480905.554, 1502480910.554, 1502480910.945, 1502480911.816
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), order = c(NA, 
NA, 1L, 1L, 1L, 1L, 1L, 1L, NA, NA, 2L, 2L, 2L, 2L, NA, NA, NA, 
3L, NA, 4L, 4L, 4L, 4L, 4L, NA, 5L, 5L, 5L, 6L, 6L, 6L, NA, NA, 
NA, NA, NA, 7L, NA, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 10L, 
10L), cat = c(0, 0, 1, 1, 1, 1, 1, 1, 1, 99, 99, 1, 1, 1, 99, 
99, 21, 1, 1, 1, 94, 1, 1, 1, 1, 1, 1, 1, 94, 1, 1, 99, 99, 1, 
61, 10, 3, 4, 4, 1, 1, 1, 1, 1, 1, 16, 1, 1, 13, 94), var1 = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 
0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 
0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 
1L), var2 = c(NA, NA, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, NA, NA, 0.9, 
0.9, 0.9, 0.9, NA, NA, NA, NA, NA, 5.3, 5.3, 5.3, 5.3, 5.3, NA, 
8.6, 8.6, 8.6, 14.5, 14.5, 14.5, NA, NA, NA, NA, NA, 7.4, NA, 
7.4, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 4.6, 4.6, -4.1, -4.1), 
    var3 = c(NA, NA, 35.8, 59.3, 51.3, 57.3, 77.5, 82.4, 41.6, 
    NA, NA, 66.8, 53, 77.1, NA, NA, 55.8, 81.4, 45.8, 37.9, NA, 
    38.5, 32, 72, 46.9, 76.4, 76.9, 88, NA, 11.7, 49.4, NA, NA, 
    64.1, NA, NA, NA, NA, NA, 72.5, 77.7, 83.3, 96.4, 83.3, 95.3, 
    NA, 69.8, 78.9, NA, NA), var4 = c(NA, NA, 26.6, 24, 9.7, 
    12.7, 21, 12.7, 9.7, NA, NA, 14, 20.3, 25.6, NA, NA, 18.6, 
    25.3, 15.7, 10.7, NA, 12.8, 8, 41.9, 12.8, 8.5, 10.2, 14.3, 
    NA, 19.3, 40, NA, NA, 1.2, NA, NA, NA, NA, NA, 10, 21.9, 
    19, 42, 11.8, 18.4, NA, 33.5, 3.7, NA, NA), var5 = c(NA, 
    NA, 2.8, 5.2, 2.3, 4.4, -0.9, 0.3, -0.8, NA, NA, 1.3, 1.5, 
    5.2, NA, NA, -0.7, -0.9, -0.3, 2.8, NA, 0.3, 1.8, 5.3, -0.9, 
    4.9, 0.9, 4.8, NA, 1.6, -0.8, NA, NA, -0.7, NA, NA, NA, NA, 
    NA, 0.4, 0.4, 2.2, 4.2, 1.5, -0.1, NA, 0.3, 1.8, NA, NA), 
    var6 = c(NA, NA, NA, NA, NA, TRUE, NA, NA, TRUE, NA, NA, 
    TRUE, TRUE, NA, NA, NA, NA, NA, TRUE, TRUE, NA, NA, NA, NA, 
    TRUE, TRUE, NA, NA, NA, NA, NA, NA, NA, TRUE, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -50L))

Within the same values of column order I need to create (for each row) two nested sub-dataframes - one with data before and one with data in this row and after. So let's take for example a block of data where order == 1:

ex %>% filter(order == 1) %>% print()

# A tibble: 6 x 9
  timestamp           order   cat  var1  var2  var3  var4   var5 var6 
  <dttm>              <int> <dbl> <int> <dbl> <dbl> <dbl>  <dbl> <lgl>
1 2017-08-11 19:46:04     1     1     1   2.5  35.8  26.6  2.8   NA   
2 2017-08-11 19:46:05     1     1     1   2.5  59.3  24    5.20  NA   
3 2017-08-11 19:46:08     1     1     1   2.5  51.3   9.7  2.3   NA   
4 2017-08-11 19:46:10     1     1     1   2.5  57.3  12.7  4.40  TRUE 
5 2017-08-11 19:46:13     1     1     1   2.5  77.5  21   -0.9   NA   
6 2017-08-11 19:46:15     1     1     0   2.5  82.4  12.7  0.300 NA   

I need two additional columns with nested data frames: data_before and data_after. For first row data_before would be empty and data_after would contain all the rows. For second row, data_before would contain only first row and data_after would contain rows from 2 to 6. For third row, data_before would contain first two rows and data_after would contains rows from 3 to 6 and so on... Such an operation need to be performed for every value of order in original data frame. How it can be accomplished?

Expected output for one block of data (with order == 1) would be:

structure(list(order = c(1, 1, 1, 1, 1, 1), data_before = list(
    structure(list(), .Names = character(0), row.names = integer(0), class = "data.frame"), 
    structure(list(timestamp = structure(1502480764.968, class = c("POSIXct", 
    "POSIXt"), tzone = "UTC"), cat = 1, var1 = 1L, var2 = 2.5, 
        var3 = 35.8, var4 = 26.6, var5 = 2.8, var6 = NA), class = c("tbl_df", 
    "tbl", "data.frame"), row.names = c(NA, -1L)), structure(list(
        timestamp = structure(c(1502480764.968, 1502480765.554
        ), class = c("POSIXct", "POSIXt"), tzone = "UTC"), cat = c(1, 
        1), var1 = c(1L, 1L), var2 = c(2.5, 2.5), var3 = c(35.8, 
        59.3), var4 = c(26.6, 24), var5 = c(2.8, 5.2), var6 = c(NA, 
        NA)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
    -2L)), structure(list(timestamp = structure(c(1502480764.968, 
    1502480765.554, 1502480768.554), class = c("POSIXct", "POSIXt"
    ), tzone = "UTC"), cat = c(1, 1, 1), var1 = c(1L, 1L, 1L), 
        var2 = c(2.5, 2.5, 2.5), var3 = c(35.8, 59.3, 51.3), 
        var4 = c(26.6, 24, 9.7), var5 = c(2.8, 5.2, 2.3), var6 = c(NA, 
        NA, NA)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
    -3L)), structure(list(timestamp = structure(c(1502480764.968, 
    1502480765.554, 1502480768.554, 1502480770.554), class = c("POSIXct", 
    "POSIXt"), tzone = "UTC"), cat = c(1, 1, 1, 1), var1 = c(1L, 
    1L, 1L, 1L), var2 = c(2.5, 2.5, 2.5, 2.5), var3 = c(35.8, 
    59.3, 51.3, 57.3), var4 = c(26.6, 24, 9.7, 12.7), var5 = c(2.8, 
    5.2, 2.3, 4.4), var6 = c(NA, NA, NA, TRUE)), class = c("tbl_df", 
    "tbl", "data.frame"), row.names = c(NA, -4L)), structure(list(
        timestamp = structure(c(1502480764.968, 1502480765.554, 
        1502480768.554, 1502480770.554, 1502480773.519), class = c("POSIXct", 
        "POSIXt"), tzone = "UTC"), cat = c(1, 1, 1, 1, 1), var1 = c(1L, 
        1L, 1L, 1L, 1L), var2 = c(2.5, 2.5, 2.5, 2.5, 2.5), var3 = c(35.8, 
        59.3, 51.3, 57.3, 77.5), var4 = c(26.6, 24, 9.7, 12.7, 
        21), var5 = c(2.8, 5.2, 2.3, 4.4, -0.9), var6 = c(NA, 
        NA, NA, TRUE, NA)), class = c("tbl_df", "tbl", "data.frame"
    ), row.names = c(NA, -5L))), data_after = list(structure(list(
    timestamp = structure(c(1502480764.968, 1502480765.554, 1502480768.554, 
    1502480770.554, 1502480773.519, 1502480775.72), class = c("POSIXct", 
    "POSIXt"), tzone = "UTC"), cat = c(1, 1, 1, 1, 1, 1), var1 = c(1L, 
    1L, 1L, 1L, 1L, 0L), var2 = c(2.5, 2.5, 2.5, 2.5, 2.5, 2.5
    ), var3 = c(35.8, 59.3, 51.3, 57.3, 77.5, 82.4), var4 = c(26.6, 
    24, 9.7, 12.7, 21, 12.7), var5 = c(2.8, 5.2, 2.3, 4.4, -0.9, 
    0.3), var6 = c(NA, NA, NA, TRUE, NA, NA)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -6L)), structure(list(
    timestamp = structure(c(1502480765.554, 1502480768.554, 1502480770.554, 
    1502480773.519, 1502480775.72), class = c("POSIXct", "POSIXt"
    ), tzone = "UTC"), cat = c(1, 1, 1, 1, 1), var1 = c(1L, 1L, 
    1L, 1L, 0L), var2 = c(2.5, 2.5, 2.5, 2.5, 2.5), var3 = c(59.3, 
    51.3, 57.3, 77.5, 82.4), var4 = c(24, 9.7, 12.7, 21, 12.7
    ), var5 = c(5.2, 2.3, 4.4, -0.9, 0.3), var6 = c(NA, NA, TRUE, 
    NA, NA)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-5L)), structure(list(timestamp = structure(c(1502480768.554, 
1502480770.554, 1502480773.519, 1502480775.72), class = c("POSIXct", 
"POSIXt"), tzone = "UTC"), cat = c(1, 1, 1, 1), var1 = c(1L, 
1L, 1L, 0L), var2 = c(2.5, 2.5, 2.5, 2.5), var3 = c(51.3, 57.3, 
77.5, 82.4), var4 = c(9.7, 12.7, 21, 12.7), var5 = c(2.3, 4.4, 
-0.9, 0.3), var6 = c(NA, TRUE, NA, NA)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -4L)), structure(list(
    timestamp = structure(c(1502480770.554, 1502480773.519, 1502480775.72
    ), class = c("POSIXct", "POSIXt"), tzone = "UTC"), cat = c(1, 
    1, 1), var1 = c(1L, 1L, 0L), var2 = c(2.5, 2.5, 2.5), var3 = c(57.3, 
    77.5, 82.4), var4 = c(12.7, 21, 12.7), var5 = c(4.4, -0.9, 
    0.3), var6 = c(TRUE, NA, NA)), class = c("tbl_df", "tbl", 
"data.frame"), row.names = c(NA, -3L)), structure(list(timestamp = structure(c(1502480773.519, 
1502480775.72), class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
    cat = c(1, 1), var1 = 1:0, var2 = c(2.5, 2.5), var3 = c(77.5, 
    82.4), var4 = c(21, 12.7), var5 = c(-0.9, 0.3), var6 = c(NA, 
    NA)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-2L)), structure(list(timestamp = structure(1502480775.72, class = c("POSIXct", 
"POSIXt"), tzone = "UTC"), cat = 1, var1 = 0L, var2 = 2.5, var3 = 82.4, 
    var4 = 12.7, var5 = 0.3, var6 = NA), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -1L)))), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -6L))

Upvotes: 4

Views: 239

Answers (3)

Ronak Shah
Ronak Shah

Reputation: 389047

Using tidyverse we can split on order and for each dataframe create two new columns data_before and data_after which would contain a list of dataframes based on the conditions.

library(tidyverse)

ex %>%
  group_split(order) %>%
  map_dfr(. %>% 
       mutate(data_before = map(seq_len(nrow(.)), function(y) .[seq_len(y - 1), ]), 
              data_after = map(seq_len(nrow(.)), function(y) 
                         if (y == nrow(.)) .[0,] else .[(y + 1):nrow(.), ]))) %>%
  select(order, data_before, data_after)


# A tibble: 50 x 3
#   order data_before      data_after      
#   <int> <list>           <list>          
# 1     1 <tibble [0 × 9]> <tibble [5 × 9]>
# 2     1 <tibble [1 × 9]> <tibble [4 × 9]>
# 3     1 <tibble [2 × 9]> <tibble [3 × 9]>
# 4     1 <tibble [3 × 9]> <tibble [2 × 9]>
# 5     1 <tibble [4 × 9]> <tibble [1 × 9]>
# 6     1 <tibble [5 × 9]> <tibble [0 × 9]>
# 7     2 <tibble [0 × 9]> <tibble [3 × 9]>
# 8     2 <tibble [1 × 9]> <tibble [2 × 9]>
# 9     2 <tibble [2 × 9]> <tibble [1 × 9]>
#10     2 <tibble [3 × 9]> <tibble [0 × 9]>
# … with 40 more rows

This can also be translated in base R in the following way

do.call(rbind, lapply(split(ex, ex$order), function(x) {
     x$data_before <- lapply(seq_len(nrow(x)), function(y) x[seq_len(y - 1), ])
     x$data_after <-  lapply(seq_len(nrow(x)), function(y) 
                       if (y == nrow(x)) x[0,] else x[(y + 1):nrow(x), ])
     x
}))

Upvotes: 0

Simon
Simon

Reputation: 602

Or this:

ex.list <- lapply(split(ex, ex$order), function(x){
  ex.x <- as.data.frame(do.call(rbind, 
          lapply(1:nrow(x), function(i){
            c(x$order[i], ifelse(i==1, list(data.frame()), list(x[1:(i-1), ])), list(x[i:nrow(x), ]))
          })
  ))
  names(ex.x) <- c('order', 'data_before', 'data_after')
  ex.x
})

Edit: Trying to give some more explanation to the code posted before:

# lapply() applies a function (input arg 2) to each element of a list (input arg 1) 
# and returns a list of return values of the function applied on each input element
ex.list <- lapply( 
  # the split() function returns a list of data.frames, subsets of ex 
  # splitted by ex$order; these will be the input for the 1. lapply() call
  split(ex, ex$order),
  # the following function will be applied to each of these data.farmes 
  # to create the return values 
  function(x){ # 'x' will be a data.frame, subset ox 'ex' with one single value of ex$order
    list.of.rows <- lapply(# we now loop over each row in the data.frame 
                           # containing data with one single value of ex$order, 
                           # 'i' is the row number
                           1:nrow(x), 
                           # the functions will create 1 row for the resulting data.frame
                           function(i){ 
                             c(# the row is 1 vector containing the following 3 values
                               # the first column of the putput data.frame is the value of ex$order
                               x$order[i], 
                               # the value for row i of data_before
                               ifelse(i==1, list(data.frame()), list(x[1:(i-1), ])), # for the first row we return an empty list, else the data.frame with previous (1:(i-1)) rows
                               # the values for row i of data_after
                               list(x[i:nrow(x), ]) # subset of rows as off row i
                               )
                             })
  # now that we have a list (list.of.rows) that contains one row for the output data.frame
  # we rbind these into one data.frame
  ex.x <- as.data.frame(do.call(rbind, # do.call(rbind, ...) cobines elements of ... using rbind()
                                list.of.rows 
  ))
  names(ex.x) <- c('order', 'data_before', 'data_after') # give column names to the output data.frame
  ex.x # define the return value of the function of the 1. lapply() call
})

Upvotes: 1

Rafael Toledo
Rafael Toledo

Reputation: 1054

Check this:

library(tidyverse)

slice_dataframe <- function(r, ord = 1) {
    tibble("order" = ord,
           "data_before" = list(slice(ex, row_number() <= (r - ord))),
           "data_after"  = list(slice(ex, row_number() >= (r + ord))))
}

map_df(1:nrow(ex), slice_dataframe)

Upvotes: 1

Related Questions