user24318
user24318

Reputation: 485

combine two rows with missing values in R

I have a dataset with an id column and measurements taken at different times. Some measurements are taken at time 0 while others are taken at time 1, which results in some missing values. I want to combine the rows for time 0 and time 1, since both are baseline measurements, into a new dataset whose time starts from 1. Basically, merge time 0 and time 1 for each id. I cannot think of a way to do that. To show what my data looks like, here is some simulated data.

set.seed(234)

# Simulate N subjects, each with a random number (2 to 6) of follow-up visits.
# NOTE: `t` shadows base::t() for the rest of the script; the name is kept so
# that any later code referring to it keeps working.
N <- 3
t <- sample(2:6, N, replace = TRUE)
id <- rep(seq_len(N), times = t)
n <- length(id)

# Five standard-normal measurement columns, one row per visit. cbind() already
# returns a matrix, so no as.matrix() wrapper is needed.
x <- cbind(
  a = rnorm(n, 0, 1),
  b = rnorm(n, 0, 1),
  c = rnorm(n, 0, 1),
  d = rnorm(n, 0, 1),
  e = rnorm(n, 0, 1)
)

# Follow-up times run 2, 3, ..., t[i] + 1 within each subject. sequence(t)
# restarts the count for every subject, so this works for any N rather than
# being spelled out for exactly three subjects.
time <- sequence(t) + 1

x1 <- cbind(id, time, x)

 ######### Add missing data

# Append the two baseline rows (time 0 and time 1) for every subject in one
# rbind() call instead of growing the matrix one row at a time.
# Column order is id, time, a, b, c, d, e. Columns a and d are recorded only
# at time 0, b and c only at time 1 (hence the NAs); e is recorded at both.
x7 <- rbind(
  x1,
  c(1, 0, 0.98, NA, NA, 0.71, 0.85),
  c(1, 1, NA, 0.85, 0.62, NA, 0.85),
  c(2, 0, 0.81, NA, NA, 0.68, 0.87),
  c(2, 1, NA, 0.97, 0.83, NA, 0.85),
  c(3, 0, 0.87, NA, NA, 0.72, 0.83),
  c(3, 1, NA, 0.98, 0.71, NA, 0.86)
)

# Sort the combined matrix by subject id, then by time within each subject,
# so the appended baseline rows (time 0 and 1) come first for every id.
newx <- x7[order(x7[, "id"], x7[, "time"]), ]

newx
       id time          a          b           c           d            e
  [1,]  1    0  0.9800000         NA          NA  0.71000000  0.850000000
  [2,]  1    1         NA  0.8500000  0.62000000          NA  0.850000000
  [3,]  1    2  0.7590390 -0.8716028 -0.30554099 -0.30528521  0.030963334
  [4,]  1    3  0.3713058  1.1876234  0.86956546 -0.28108275  0.669563187
  [5,]  1    4  0.5758514 -0.6672287 -1.06121591 -1.16458396 -0.140668367
  [6,]  1    5 -0.5703207  0.5383396 -0.09635967  0.09034109  1.281077794
  [7,]  1    6  0.1198567  0.4905632  0.47460932  1.01451692 -0.621039707
  [8,]  2    0  0.8100000         NA          NA  0.68000000  0.870000000
  [9,]  2    1         NA  0.9700000  0.83000000          NA  0.850000000
 [10,]  2    2  0.2095484 -1.0216529 -0.02671707  0.37160636  0.160315383
 [11,]  2    3 -0.1481357 -0.3726091  1.10167492  1.70677625 -0.860442148
 [12,]  2    4  0.6433900  1.3251178 -0.26842418  0.92790039  0.318602469
 [13,]  2    5  1.1348350 -0.7313432  0.01035965  1.05747589 -1.829611181
 [14,]  2    6  0.1995994  0.7625386  0.25897152 -1.05112649 -1.121045817
 [15,]  3    0  0.8700000         NA          NA  0.72000000  0.830000000
 [16,]  3    1         NA  0.9800000  0.71000000          NA  0.860000000
 [17,]  3    2  0.2987197  0.3275333 -0.39459737  2.48875683  0.002293782
 [18,]  3    3 -0.3191671 -1.1440187 -0.48873668 -0.32581308 -0.289496481

Upvotes: 0

Views: 256

Answers (2)

akrun
akrun

Reputation: 887153

We can also use data.table

library(data.table)
library(zoo)

# Backfill each id's NAs from the next observation (fromLast = TRUE) while the
# time-1 row is still present, so the time-0 row picks up the time-1 baseline
# values. Only afterwards drop the time-1 row and relabel time 0 as time 1.
# Filtering out time 1 first would fill the baseline from time 2 instead.
# na.rm = FALSE keeps any NA that has no later value to fill from rather than
# letting na.locf drop it.
as.data.table(newx)[
  , na.locf(.SD, fromLast = TRUE, na.rm = FALSE), by = id
][time != 1][time == 0, time := 1][]
# id time          a          b           c           d            e
# 1:  1    1  0.9800000  0.8500000  0.62000000  0.71000000  0.850000000
# 2:  1    2  0.7590390 -0.8716028 -0.30554099 -0.30528521  0.030963334
# 3:  1    3  0.3713058  1.1876234  0.86956546 -0.28108275  0.669563187
# 4:  1    4  0.5758514 -0.6672287 -1.06121591 -1.16458396 -0.140668367
# 5:  1    5 -0.5703207  0.5383396 -0.09635967  0.09034109  1.281077794
# 6:  1    6  0.1198567  0.4905632  0.47460932  1.01451692 -0.621039707
# 7:  2    1  0.8100000  0.9700000  0.83000000  0.68000000  0.870000000
# 8:  2    2  0.2095484 -1.0216529 -0.02671707  0.37160636  0.160315383
# 9:  2    3 -0.1481357 -0.3726091  1.10167492  1.70677625 -0.860442148
#10:  2    4  0.6433900  1.3251178 -0.26842418  0.92790039  0.318602469
#11:  2    5  1.1348350 -0.7313432  0.01035965  1.05747589 -1.829611181
#12:  2    6  0.1995994  0.7625386  0.25897152 -1.05112649 -1.121045817
#13:  3    1  0.8700000  0.9800000  0.71000000  0.72000000  0.830000000
#14:  3    2  0.2987197  0.3275333 -0.39459737  2.48875683  0.002293782
#15:  3    3 -0.3191671 -1.1440187 -0.48873668 -0.32581308 -0.289496481

Upvotes: 1

Andrew Lavers
Andrew Lavers

Reputation: 4378

I am not sure if this is what you want, since you don't show the expected results. This uses na.locf from package zoo to roll the measurements backwards (fromLast = TRUE), filling in each NA with the value that follows it. Package dplyr supplies group_by and a column-wise mutate, which operate on data frames, so the fill is applied to every column within each id.

library(dplyr)
library(zoo)

newx %>%
  data.frame() %>%
  group_by(id) %>%
  # across() is the current replacement for the superseded mutate_all().
  # Backfill each NA from the next row of the same id. na.rm = FALSE keeps
  # every column at full length even if a trailing NA has nothing to fill
  # from; the default would shorten the vector and make mutate() fail.
  mutate(across(
    everything(),
    function(col) na.locf(col, fromLast = TRUE, na.rm = FALSE)
  )) %>%
  # Drop the grouping so later steps and callers get a plain data frame.
  ungroup() %>%
  # The time-1 values now sit on the time-0 row: drop time 1, relabel time 0.
  filter(time != 1) %>%
  mutate(time = if_else(time == 0, 1, time))

#       id  time          a          b           c           d            e
# 1      1     1  0.9800000  0.8500000  0.62000000  0.71000000  0.850000000
# 2      1     2  0.7590390 -0.8716028 -0.30554099 -0.30528521  0.030963334
# 3      1     3  0.3713058  1.1876234  0.86956546 -0.28108275  0.669563187
# 4      1     4  0.5758514 -0.6672287 -1.06121591 -1.16458396 -0.140668367
# 5      1     5 -0.5703207  0.5383396 -0.09635967  0.09034109  1.281077794
# 6      1     6  0.1198567  0.4905632  0.47460932  1.01451692 -0.621039707
# 7      2     1  0.8100000  0.9700000  0.83000000  0.68000000  0.870000000
# 8      2     2  0.2095484 -1.0216529 -0.02671707  0.37160636  0.160315383
# 9      2     3 -0.1481357 -0.3726091  1.10167492  1.70677625 -0.860442148
# 10     2     4  0.6433900  1.3251178 -0.26842418  0.92790039  0.318602469
# 11     2     5  1.1348350 -0.7313432  0.01035965  1.05747589 -1.829611181
# 12     2     6  0.1995994  0.7625386  0.25897152 -1.05112649 -1.121045817
# 13     3     1  0.8700000  0.9800000  0.71000000  0.72000000  0.830000000
# 14     3     2  0.2987197  0.3275333 -0.39459737  2.48875683  0.002293782
# 15     3     3 -0.3191671 -1.1440187 -0.48873668 -0.32581308 -0.289496481

Upvotes: 3

Related Questions