user7677771

Reputation: 69

R: expand data frame columnwise with shifted rows of data

- Example Data to work with:

As a reduced example, here is the data (the output of dput(df)):

df <- structure(list(SubjectID = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L), .Label = c("1", "2", "3"), class = "factor"), EventNumber = structure(c(1L, 
1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor"), 
    EventType = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 
    1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L
    ), .Label = c("A", "B"), class = "factor"), Param1 = c(0.3, 
    0.21, 0.87, 0.78, 0.9, 1.2, 1.4, 1.3, 0.6, 0.45, 0.45, 0.04, 
    0, 0.1, 0.03, 0.01, 0.09, 0.06, 0.08, 0.09, 0.03, 0.04, 0.04, 
    0.02), Param2 = c(45, 38, 76, 32, 67, 23, 27, 784, 623, 54, 
    54, 1056, 487, 341, 671, 859, 7769, 2219, 4277, 4060, 411, 
    440, 224, 57), Param3 = c(1.5, 1.7, 1.65, 1.32, 0.6, 0.3, 
    2.5, 0.4, 1.4, 0.67, 0.67, 0.32, 0.1, 0.15, 0.22, 0.29, 0.3, 
    0.2, 0.8, 1, 0.9, 0.8, 0.3, 0.1), Param4 = c(0.14, 0, 1, 
    0.86, 0, 0.6, 1, 1, 0.18, 0, 0, 0.39, 0, 1, 0.29, 0.07, 0.33, 
    0.53, 0.29, 0.23, 0.84, 0.61, 0.57, 0.59), Param5 = c(0.18, 
    0, 1, 0, 1, 0, 0.09, 1, 0.78, 0, 0, 1, 0.2, 0, 0.46, 0.72, 
    0.16, 0.22, 0.77, 0.52, 0.2, 0.68, 0.58, 0.17), Param6 = c(0, 
    1, 0.75, 0, 0.14, 0, 1, 0, 1, 0.27, 0, 1, 0, 0.23, 0.55, 
    0.86, 1, 0.33, 1, 1, 0.88, 0.75, 0, 0), AbsoluteTime = structure(c(1522533600, 
    1522533602, 1522533604, 1522533604, 1525125600, 1525125602, 
    1525125604, 1519254000, 1519254002, 1519254004, 1519254006, 
    1521759600, 1521759602, 1521759604, 1521759606, 1521759608, 
    1517353224, 1517353226, 1517353228, 1517353230, 1517439600, 
    1517439602, 1517439604, 1517439606), class = c("POSIXct", 
    "POSIXt"), tzone = "")), row.names = c(NA, -24L), class = "data.frame")
df

The real data has 20 subjects, EventNumbers ranging from 1 to 100, and parameters from Param1 up to Param40 (depending on the experiment). The row count is around 60,000 observations.

- What I want to achieve:

For df, create n * 40 new columns (40, or whatever number of parameters is chosen later).

Think of n as "steps into the future". Name the 40 * n newly created columns:

Param1_2, Param2_2, Param3_2, ..., Param39_2, Param40_2, ...,

Param1_3, Param2_3, Param3_3, ..., Param39_3, Param40_3, ...,

...,

Param1_n, Param2_n, Param3_n, ..., Param39_n, Param40_n

Resulting in columns

Param1_1, Param2_1, Param1_2, Param2_2, Param1_3, Param2_3, Param1_4, Param2_4, ... Param1_n, Param2_n

So every observation of subset df[X, c(4:9)] will get an additional set of variables with values from df[X+1, c(4:9)] to df[X+n, c(4:9)].
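
To make the intent concrete, this is the shift for a single parameter and n = 1 in base R (an illustration of the grouping and the NA padding only; the question is how to do this for all parameters and all n steps at once):

# one-step "lead" of Param1 within each SubjectID/EventNumber group;
# the last row of every group becomes NA because there is no row X+1 in that group
param1_step2 <- ave(df$Param1, df$SubjectID, df$EventNumber,
                    FUN = function(x) c(x[-1], NA))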

This is what the new df.extended should look like for n = 1:

df.extended <- structure(list(SubjectID = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3), EventNumber = c(1, 1, 
1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 
2), EventType = c("A", "A", "A", "A", "B", "B", "B", "A", "A", 
"A", "A", "B", "B", "B", "B", "B", "A", "A", "A", "A", "B", "B", 
"B", "B"), Param1 = c(0.3, 0.21, 0.87, 0.78, 0.9, 1.2, 1.4, 1.3, 
0.6, 0.45, 0.45, 0.04, 0, 0.1, 0.03, 0.01, 0.05, 0.07, 0.06, 
0.01, 0.01, 0.01, 0.07, 0.04), Param2 = c(45, 38, 76, 32, 67, 
23, 27, 784, 623, 54, 54, 1056, 487, 341, 671, 859, 1858, 640, 
8181, 220, 99, 86, 170, 495), Param3 = c(1.5, 1.7, 1.65, 1.32, 
0.6, 0.3, 2.5, 0.4, 1.4, 0.67, 0.67, 0.32, 0.1, 0.15, 0.22, 0.29, 
1.5, 0.9, 0.8, 0.9, 0.1, 0, 0.8, 0.1), Param4 = c(0.14, 0, 1, 
0.86, 0, 0.6, 1, 1, 0.18, 0, 0, 0.39, 0, 1, 0.29, 0.07, 0.64, 
0.11, 0.12, 0.32, 0.55, 0.67, 0.83, 0.82), Param5 = c(0.18, 0, 
1, 0, 1, 0, 0.09, 1, 0.78, 0, 0, 1, 0.2, 0, 0.46, 0.72, 0.27, 
0.14, 0.7, 0.67, 0.23, 0.44, 0.61, 0.76), Param6 = c(0, 1, 0.75, 
0, 0.14, 0, 1, 0, 1, 0.27, 0, 1, 0, 0.23, 0.55, 0.86, 1, 0.56, 
0.45, 0.5, 0, 0, 0.89, 0.11), AbsoluteTime = c("2018-04-01 00:00:00", 
"2018-04-01 00:00:02", "2018-04-01 00:00:04", "2018-04-01 00:00:04", 
"2018-05-01 00:00:00", "2018-05-01 00:00:02", "2018-05-01 00:00:04", 
"2018-02-22 00:00:00", "2018-02-22 00:00:02", "2018-02-22 00:00:04", 
"2018-02-22 00:00:06", "2018-03-23 00:00:00", "2018-03-23 00:00:02", 
"2018-03-23 00:00:04", "2018-03-23 00:00:06", "2018-03-23 00:00:08", 
"2018-01-31 00:00:24", "2018-01-31 00:00:26", "2018-01-31 00:00:28", 
"2018-01-31 00:00:30", "2018-02-01 00:00:00", "2018-02-01 00:00:02", 
"2018-02-01 00:00:04", "2018-02-01 00:00:06"), Param1_2 = c(0.21, 
0.87, 0.78, NA, 1.2, 1.4, NA, 0.6, 0.45, 0.45, NA, 0, 0.1, 0.03, 
0.01, NA, 0.07, 0.07, 0.08, NA, 0.09, 0.06, 0.01, NA), Param2_2 = c(38, 
76, 32, NA, 23, 27, NA, 623, 54, 54, NA, 487, 341, 671, 859, 
NA, 6941, 4467, 808, NA, 143, 301, 219, NA), Param3_2 = c(1.7, 
1.65, 1.32, NA, 0.3, 2.5, NA, 1.4, 0.67, 0.67, NA, 0.1, 0.15, 
0.22, 0.29, NA, 1, 1, 0.1, NA, 0.5, 1, 0.3, NA), Param4_2 = c(0, 
1, 0.86, NA, 0.6, 1, NA, 0.18, 0, 0, NA, 0, 1, 0.29, 0.07, NA, 
0.31, 0.16, 0.68, NA, 0.86, 0.47, 0.47, NA), Param5_2 = c(0, 
1, 0, NA, 0, 0.09, NA, 0.78, 0, 0, NA, 0.2, 0, 0.46, 0.72, NA, 
0.29, 0.26, 0.1, NA, 0.88, 0.86, 0.95, NA), Param6_2 = c(1, 0, 
0, NA, 0, 1, NA, 1, 0.27, 0, NA, 0, 0.23, 0.55, 0.86, NA, 0.68, 
0.66, 0, NA, 0.44, 1, 0.22, NA)), row.names = c(NA, 24L), class = "data.frame")
df.extended

How can this be solved without loops and without writing column indexes by hand? Should I write a function for Trial 2 and use doBy?

My thoughts and what I have done so far to solve this:

  1. Trial 1:

    1. Cycle through the SubjectIDs in a for-loop
    2. In an inner for-loop, cycle through the EventNumber
    3. In another inner for-loop, cycle through the rows
    4. Get the first row by grabbing df[1, ] and save it into df.temp
    5. Merge df.temp with df[2, parameters]
    6. Merge df.temp with df[3, parameters] and so on
    7. Save all resulting df.temps into df.final

    Problems I ran into at step 5:

    df.temp <- df[1,]
    df.temp <- merge(df.temp, df[2, !(colnames(df) == "AbsoluteTime")], by = c("SubjectID", "EventNumber", "EventType"))
    df.temp <- merge(df.temp, df[3, !(colnames(df) == "AbsoluteTime")], by = c("SubjectID", "EventNumber", "EventType"))
    df.temp <- merge(df.temp, df[4, !(colnames(df) == "AbsoluteTime")], by = c("SubjectID", "EventNumber", "EventType"))
    Warning:
    In merge.data.frame(df.temp, df[4, ], by = c("SubjectID", "EventNumber",  :
      column names ‘Param1.x’, ‘Param2.x’, ‘Param3.x’, ‘Param4.x’, ‘Param5.x’,
      ‘Param6.x’, ‘AbsoluteTime.x’, ‘Param1.y’, ‘Param2.y’, ‘Param3.y’,
      ‘Param4.y’, ‘Param5.y’, ‘Param6.y’, ‘AbsoluteTime.y’ are duplicated in the result.

    • The column names are repeated, see the warning.
    • I cannot figure out how to easily create the column names / rename the new columns based on a given column name and a variable (the step number).

    There must be a better way than this (one vectorised alternative is sketched after this list):

    parameters <- names(df)[4:9]   # the parameter columns (Param1 to Param6)
    n <- 3
    names_vector <- c()
    for (k in seq_len(n)) {        # k = step into the future
      for (i in parameters) {
        names_vector <- c(names_vector, paste0(i, "_", k + 1))
      }
    }
    names(df.temp)[4:9] <- parameters
    names(df.temp)[11:ncol(df.temp)] <- names_vector
    names(df.temp)
    
    • Also, how do I keep the last n-1 rows of each group (which have no following rows) from breaking the script? This is a lot of work to do by hand and I think quite error-prone!?
  2. Trial 2:

    1. Cycle through the SubjectIDs in a for-loop
    2. In an inner for-loop, cycle through the EventNumber
    3. Get all rows of parameters into a new data frame except the first row
    4. Append a row with NAs
    5. Use cbind() to attach the shifted columns to df.temp
    6. Repeat n times.

    This is the code for one SubjectID and one EventNumber:

    df.temp <- df[which(df$SubjectID == "1" & df$EventNumber == "1"), ]
    # parameters as defined above: names(df)[4:9]
    # shift by one row: drop the first row, pad with one NA row at the end
    df.temp2 <- df.temp[2:nrow(df.temp), parameters]
    df.temp2 <- rbind(df.temp2, NA)
    df.temp <- cbind(df.temp, df.temp2)
    # shift by two rows
    df.temp2 <- df.temp[3:nrow(df.temp), parameters]
    df.temp2 <- rbind(df.temp2, NA, NA)
    df.temp <- cbind(df.temp, df.temp2)
    # shift by three rows
    df.temp2 <- df.temp[4:nrow(df.temp), parameters]
    df.temp2 <- rbind(df.temp2, NA, NA, NA)
    df.temp <- cbind(df.temp, df.temp2)
    # build the new column names and rename
    n <- 3
    names_vector <- c()
    for (k in seq_len(n)) {
      for (i in parameters) {
        names_vector <- c(names_vector, paste0(i, "_", k + 1))
      }
    }
    names(df.temp)[4:9] <- parameters
    names(df.temp)[11:ncol(df.temp)] <- names_vector
    df.temp
    
    • That solves the problem with missing rows (NAs are acceptable in my case).
    • Still lots of work by hand / for-loops, and error-prone!?
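
For the column-name generation used in both trials, a vectorised sketch (assuming, as above, that the parameter columns are names(df)[4:9] and that n is the number of steps into the future) avoids the nested for-loops:

# params x steps grid; reading it column by column gives Param1_2, ..., Param6_2,
# Param1_3, ..., Param6_4 -- the same order as the nested loops above
n <- 3
parameters <- names(df)[4:9]
names_vector <- as.vector(outer(parameters, seq_len(n) + 1, paste, sep = "_"))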

Upvotes: 2

Views: 151

Answers (2)

Parfait

Reputation: 107567

For base R, consider by to slice the data by SubjectID, EventNumber, and EventType, and inside each slice run a merge against a copy shifted with a helper grp_num column. To build a series of Param*_i sets, wrap the by process in an lapply over the steps for a list of data frames, chain-merge that list with Reduce, and finally merge the result with the original data frame:

df_list <- lapply(2:3, function(i) {
  # BUILD LIST OF DATAFRAMES
  by_list <- by(df, df[c("SubjectID", "EventNumber", "EventType")], FUN=function(sub){

    sub$grp_num <- 1:nrow(sub)
    row_less_sub <- transform(sub, AbsoluteTime=NULL, grp_num=grp_num-(i-1))

    merge(sub, row_less_sub, by=c("SubjectID", "EventNumber", "EventType", "grp_num"), 
          all.x=TRUE, suffixes = c("", paste0("_", i)))
  })

  # APPEND ALL DATAFRAMES IN LIST
  grp_df <- do.call(rbind, by_list)
  grp_df <- with(grp_df, grp_df[order(SubjectID, EventNumber),])
  # KEEP NEEDED COLUMNS
  grp_df <- grp_df[c("SubjectID", "EventNumber", "EventType", "grp_num",
                   names(grp_df)[grep("Param[0-9]+_", names(grp_df))])]
  row.names(grp_df) <- NULL

  return(grp_df)
})

# ALL PARAMS_* CHAIN MERGE
params_df <- Reduce(function(x,y) merge(x, y, by=c("SubjectID", "EventNumber", "EventType", "grp_num")), df_list)

# ORIGINAL DF AND PARAMS MERGE
df$grp_num <- ave(df$Param1, df$SubjectID, df$EventNumber, df$EventType, 
                  FUN=function(x) cumsum(rep(1, length(x))))

final_df <- transform(merge(df, params_df, by=c("SubjectID", "EventNumber", "EventType", "grp_num")), grp_num=NULL)

Output

head(final_df, 10)

#    SubjectID EventNumber EventType Param1 Param2 Param3 Param4 Param5 Param6        AbsoluteTime Param1_2 Param2_2 Param3_2 Param4_2 Param5_2 Param6_2 Param1_3 Param2_3 Param3_3 Param4_3 Param5_3 Param6_3
# 1          1           1         A   0.30     45   1.50   0.14   0.18   0.00 2018-03-31 17:00:00     0.21       38     1.70     0.00     0.00     1.00     0.87       76     1.65     1.00     1.00     0.75
# 2          1           1         A   0.21     38   1.70   0.00   0.00   1.00 2018-03-31 17:00:02     0.87       76     1.65     1.00     1.00     0.75     0.78       32     1.32     0.86     0.00     0.00
# 3          1           1         A   0.87     76   1.65   1.00   1.00   0.75 2018-03-31 17:00:04     0.78       32     1.32     0.86     0.00     0.00       NA       NA       NA       NA       NA       NA
# 4          1           1         A   0.78     32   1.32   0.86   0.00   0.00 2018-03-31 17:00:04       NA       NA       NA       NA       NA       NA       NA       NA       NA       NA       NA       NA
# 5          1           2         B   0.90     67   0.60   0.00   1.00   0.14 2018-04-30 17:00:00     1.20       23     0.30     0.60     0.00     0.00     1.40       27     2.50     1.00     0.09     1.00
# 6          1           2         B   1.20     23   0.30   0.60   0.00   0.00 2018-04-30 17:00:02     1.40       27     2.50     1.00     0.09     1.00       NA       NA       NA       NA       NA       NA
# 7          1           2         B   1.40     27   2.50   1.00   0.09   1.00 2018-04-30 17:00:04       NA       NA       NA       NA       NA       NA       NA       NA       NA       NA       NA       NA
# 8          2           1         A   1.30    784   0.40   1.00   1.00   0.00 2018-02-21 17:00:00     0.60      623     1.40     0.18     0.78     1.00     0.45       54     0.67     0.00     0.00     0.27
# 9          2           1         A   0.60    623   1.40   0.18   0.78   1.00 2018-02-21 17:00:02     0.45       54     0.67     0.00     0.00     0.27     0.45       54     0.67     0.00     0.00     0.00
# 10         2           1         A   0.45     54   0.67   0.00   0.00   0.27 2018-02-21 17:00:04     0.45       54     0.67     0.00     0.00     0.00       NA       NA       NA       NA       NA       NA
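
To build more than two steps ahead, the hard-coded 2:3 in the outer lapply generalises to 2:(n + 1) for n steps. And as a rough sanity check of the one-step columns against the question's definition (values of the next row within each SubjectID/EventNumber/EventType group, NA on a group's last row), one could recompute the lead with ave() and compare:

# compare Param1_2 with a grouped one-step lead of Param1; expected to be TRUE
with(final_df, all.equal(
  Param1_2,
  ave(Param1, SubjectID, EventNumber, EventType, FUN = function(x) c(x[-1], NA))
))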

Upvotes: 1

Alanisson

Reputation: 38

What about something like this:

You can use the developer version of the package dplyr to add and rename variables according to various subsets of interest in your data. dplyr also provides the functions lead() and lag(), which can be used to find the "next" or "previous" values in a vector (or, here, row). You can use lead() in combination with mutate_at() to pull the values from the nth following row and use them to create a new set of variables.

Here I use the data you provided in your example:

# load dplyr package
require(dplyr)

# create new data frame "df.extended"
df.extended <- df

# desired number of successions ("steps into the future");
# at most the number of observations per group (e.g., per SubjectID)
obs = 3

# loop until number of successions achieved
for (i in 1:obs) {

  # overwrite df.extended with new information
   df.extended <- df.extended %>% 
     # group by subjects and events
     group_by(SubjectID, EventNumber) %>%
     # create new variable for each parameter
     mutate_at( vars(Param1:Param6), 
                # using the lead function
                .funs = funs(step = lead),
                # for the nth following row
                n = i) %>% 
     # rename the new variables to show the succession number
     rename_at(vars(contains("_step")), funs(sub("step", as.character(i), .)))

}

This should roughly recreate the data you posted as the desired result.

# Look at first part of "df.extended"
> head(df.extended)

# A tibble: 6 x 28
# Groups:   SubjectID, EventNumber [2]
  SubjectID EventNumber EventType Param1 Param2 Param3 Param4 Param5 Param6 AbsoluteTime        Param1_1 Param2_1 Param3_1 Param4_1 Param5_1 Param6_1
  <fct>     <fct>       <fct>      <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl> <dttm>                 <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
1 1         1           A          0.300    45.  1.50   0.140  0.180  0.    2018-04-01 00:00:00    0.210      38.    1.70     0.      0.        1.00 
2 1         1           A          0.210    38.  1.70   0.     0.     1.00  2018-04-01 00:00:02    0.870      76.    1.65     1.00    1.00      0.750
3 1         1           A          0.870    76.  1.65   1.00   1.00   0.750 2018-04-01 00:00:04    0.780      32.    1.32     0.860   0.        0.   
4 1         1           A          0.780    32.  1.32   0.860  0.     0.    2018-04-01 00:00:04   NA          NA    NA       NA      NA        NA    
5 1         2           B          0.900    67.  0.600  0.     1.00   0.140 2018-05-01 00:00:00    1.20       23.    0.300    0.600   0.        0.   
6 1         2           B          1.20     23.  0.300  0.600  0.     0.    2018-05-01 00:00:02    1.40       27.    2.50     1.00    0.0900    1.00 
# ... with 12 more variables: Param1_2 <dbl>, Param2_2 <dbl>, Param3_2 <dbl>, Param4_2 <dbl>, Param5_2 <dbl>, Param6_2 <dbl>, Param1_3 <dbl>,
#   Param2_3 <dbl>, Param3_3 <dbl>, Param4_3 <dbl>, Param5_3 <dbl>, Param6_3 <dbl>
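
One small difference from the desired df.extended: here the first lead step is named Param1_1, Param2_1, ..., while the question labels that step _2. If the question's numbering is wanted, the suffixes could presumably be bumped after the loop; a base R sketch (assuming only the newly created columns match the Param<i>_<step> pattern, which holds here):

# shift every "_<step>" suffix up by one so the first lead step becomes "_2"
idx  <- grep("^Param[0-9]+_[0-9]+$", names(df.extended))
step <- as.integer(sub(".*_", "", names(df.extended)[idx]))
base <- sub("_[0-9]+$", "", names(df.extended)[idx])
names(df.extended)[idx] <- paste0(base, "_", step + 1)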

Upvotes: 1
