Mikael Jumppanen
Mikael Jumppanen

Reputation: 2486

Addition of a new column to longitudinal data in R, based on matching, time, response and grouping information.

Example data consists of 4 columns:

The task is to calculate a new column based on grouping information in a "TreatmentGroup" column and matching information in a "Match" column. New column will contain response variable of treatmentX - control of that matched group.
For example: response of treatment 2 - control(1.2 -1.8 = -0.6) and next row 1.4-2.0=-0.6. So treatment response is compared to control response at a given time (0 or 1). Example data and result table(manually calculated):

   TreatmentGroup  ID Response Time   Match
1      treatment2 ID1      1.2    0 group_1
2      treatment2 ID1      1.4    1 group_1
3         control ID2      1.8    0 group_1
4         control ID2      2.0    1 group_1
5      treatment1 ID3      1.5    0 group_1
6      treatment1 ID3      1.8    1 group_1
7      treatment2 ID4      0.2    0 group_2
8      treatment2 ID4      0.3    1 group_2
9         control ID5      2.5    0 group_2
10        control ID5      2.8    1 group_2
11     treatment1 ID6      3.2    0 group_2
12     treatment1 ID6      3.5    1 group_2


   TreatmentGroup  ID Response Time   Match Paired_sub
1      treatment2 ID1      1.2    0 group_1       -0.6
2      treatment2 ID1      1.4    1 group_1       -0.6
3         control ID2      1.8    0 group_1        0.0
4         control ID2      2.0    1 group_1        0.0
5      treatment1 ID3      1.5    0 group_1       -0.3
6      treatment1 ID3      1.8    1 group_1        0.2
7      treatment2 ID4      0.2    0 group_2       -2.3
8      treatment2 ID4      0.3    1 group_2       -2.5
9         control ID5      2.5    0 group_2        0.0
10        control ID5      2.8    1 group_2        0.0
11     treatment1 ID6      3.2    0 group_2        0.7
12     treatment1 ID6      3.5    1 group_2        0.7

What would be the best approaches(or answer) for this kind of problem?
Code for generating example tables:

df <- data.frame("TreatmentGroup"=c("treatment2", "treatment2", "control", "control",  "treatment1", "treatment1"),
                 "ID" = c("ID1","ID1", "ID2","ID2","ID3","ID3", "ID4","ID4", "ID5","ID5", "ID6","ID6"),
                 "Response"=c(1.2, 1.4, 1.8, 2.0, 1.5, 1.8, 0.2,0.3,2.5,2.8,3.2,3.5),
                 "Time" = c(0,1,0,1,0,1),
                 "Match" = c("group_1", "group_1","group_1", "group_1","group_1", "group_1","group_2", "group_2","group_2", "group_2","group_2", "group_2")
                 )
                 
                 
result <- data.frame("TreatmentGroup"=c("treatment2", "treatment2", "control", "control",  "treatment1", "treatment1"),
                 "ID" = c("ID1","ID1", "ID2","ID2","ID3","ID3", "ID4","ID4", "ID5","ID5", "ID6","ID6"),
                 "Response"=c(1.2, 1.4, 1.8, 2.0, 1.5, 1.8, 0.2,0.3,2.5,2.8,3.2,3.5),
                 "Time" = c(0,1,0,1,0,1),
                 "Match" = c("group_1", "group_1","group_1", "group_1","group_1", "group_1","group_2", "group_2","group_2", "group_2","group_2", "group_2"),
                 "Paired_sub" = c(-0.6,-0.6,0,0,-0.3, 0.2,-2.3,-2.5, 0,0, 0.7,0.7)
                 )     

Upvotes: 0

Views: 237

Answers (2)

akrun
akrun

Reputation: 887213

You can use base R as well

indx <- with(df, paste(Time, Match))
within(df, {indx2 <- TreatmentGroup=='control'
            Paired_sub <- Response - setNames(Response[indx2], 
                 indx[indx2])[setNames(indx, 1:nrow(df))]})[,-7]
#   TreatmentGroup  ID Response Time   Match Paired_sub
#1      treatment2 ID1      1.2    0 group_1       -0.6
#2      treatment2 ID1      1.4    1 group_1       -0.6
#3         control ID2      1.8    0 group_1        0.0
#4         control ID2      2.0    1 group_1        0.0
#5      treatment1 ID3      1.5    0 group_1       -0.3
#6      treatment1 ID3      1.8    1 group_1       -0.2
#7      treatment2 ID4      0.2    0 group_2       -2.3
#8      treatment2 ID4      0.3    1 group_2       -2.5
#9         control ID5      2.5    0 group_2        0.0
#10        control ID5      2.8    1 group_2        0.0
#11     treatment1 ID6      3.2    0 group_2        0.7
#12     treatment1 ID6      3.5    1 group_2        0.7

Or using split/unsplit

unsplit(
   lapply(split(df, list(df$Match, df$Time), drop=TRUE), function(x) 
     transform(x, Paired_sub = Response - Response[TreatmentGroup=='control'])), 
             list(df$Match, df$Time))

#   TreatmentGroup  ID Response Time   Match Paired_sub
#1      treatment2 ID1      1.2    0 group_1       -0.6
#2      treatment2 ID1      1.4    1 group_1       -0.6
#3         control ID2      1.8    0 group_1        0.0
#4         control ID2      2.0    1 group_1        0.0
#5      treatment1 ID3      1.5    0 group_1       -0.3
#6      treatment1 ID3      1.8    1 group_1       -0.2
#7      treatment2 ID4      0.2    0 group_2       -2.3
#8      treatment2 ID4      0.3    1 group_2       -2.5
#9         control ID5      2.5    0 group_2        0.0
#10        control ID5      2.8    1 group_2        0.0
#11     treatment1 ID6      3.2    0 group_2        0.7
#12     treatment1 ID6      3.5    1 group_2        0.7

Another option in base R would be to merge the df with subset of df with control TreatmentGroup

 df$id <- 1:nrow(df)
 df1 <- merge(df[,-c(1:2)], subset(df, TreatmentGroup=='control')[,-c(1:2,6)],
               by=c('Time', 'Match'), sort=FALSE)

 df$Paired_sub <- with(df1[order(df1$id),], Response.x-Response.y)
 df$Paired_sub
 #[1] -0.6 -0.6  0.0  0.0 -0.3 -0.2 -2.3 -2.5  0.0  0.0  0.7  0.7

Upvotes: 2

talat
talat

Reputation: 70276

Many options to do this, one of them is with dplyr:

require(dplyr)
df %>% 
  group_by(Match, Time) %>%
  mutate(Paired_sub = Response - Response[TreatmentGroup == "control"])

#Source: local data frame [12 x 6]
#Groups: Match, Time
#
#  TreatmentGroup  ID Response Time   Match Paired_sub
#1      treatment2 ID1      1.2    0 group_1       -0.6
#2      treatment2 ID1      1.4    1 group_1       -0.6
#3         control ID2      1.8    0 group_1        0.0
#4         control ID2      2.0    1 group_1        0.0
#5      treatment1 ID3      1.5    0 group_1       -0.3
#6      treatment1 ID3      1.8    1 group_1       -0.2
#7      treatment2 ID4      0.2    0 group_2       -2.3
#8      treatment2 ID4      0.3    1 group_2       -2.5
#9         control ID5      2.5    0 group_2        0.0
#10        control ID5      2.8    1 group_2        0.0
#11     treatment1 ID6      3.2    0 group_2        0.7
#12     treatment1 ID6      3.5    1 group_2        0.7

The equivalent data.table approach would be:

require(data.table)
setDT(df)[, Paired_sub := Response - Response[TreatmentGroup == "control"], by = list(Match, Time)]

One more option, using base R:

df <- do.call(rbind, lapply(split(df, interaction(df$Match, df$Time)), function(dd) {
  dd$Paired_sub <- with(dd, Response - Response[TreatmentGroup == "control"])
  dd}))
rownames(df) <- NULL

Only the ordering should be different here, the numbers in paired_sub are hopefully the same as in the other answers.

Upvotes: 5

Related Questions