Reputation: 217

merge multiple dataframes based on matching timestamp

I have 6 dataframes, all with unique column names,the same amount of columns, and the data was collected over the same time period.

Each data frame has a time stamp and minute averages were taken, but some dataframes having missing data and the columns lengths are not equal.

I would like to merge the data frames to display all 6 data frames side by side, but only at times that data was present across all 6 dataframes, i.e. the df with the lowest numbers of columns, which is "H1_min"

> head(H1_min)
            h1min h1temp h1humid   h1db     h1hz
1 2015-09-06 00:00:00   21.5   73.10 39.252 117.1900
2 2015-09-06 00:02:00   21.5   72.50 39.434 125.0000
3 2015-09-06 00:03:00   21.5   72.65 39.338 127.9325
4 2015-09-06 00:04:00   21.5   73.00 39.206 148.4400
5 2015-09-06 00:06:00   21.5   73.00 39.253 144.5350
6 2015-09-06 00:07:00   21.5   72.30 39.293 156.2500

The colnames of the other dataframes are similar, but H1 = H2 thru H6.

dput(head(H2_min))

"2015-09-08 20:21:00", "2015-09-08 20:22:00", "2015-09-08 20:23:00", 
"2015-09-08 20:24:00", "2015-09-08 20:25:00", "2015-09-08 20:26:00", 
"2015-09-08 20:27:00", "2015-09-08 20:28:00", "2015-09-08 20:29:00", 
"2015-09-08 20:30:00", "2015-09-08 20:31:00", "2015-09-08 20:32:00", 
"2015-09-08 20:33:00", "2015-09-08 20:34:00", "2015-09-08 20:35:00"
), class = "factor"), h2temp = c(23.4, 23.4, 23.3, 23.2, 23.2, 
23.1), h2humid = c(38.5, 38.3, 38.05, 38.1, 38.6, 38.6), h2db = c(38.834, 
38.655, 38.679, 38.695, 38.806, 38.702), h2hz = c(191.41, 152.34, 
162.11, 113.28, 121.09, 164.06)), .Names = c("h2min", "h2temp", 
"h2humid", "h2db", "h2hz"), row.names = c(NA, 6L), class = "data.frame")

dput(head(H4_min))

"2015-09-08 17:10:00", "2015-09-08 17:11:00", "2015-09-08 17:12:00", 
"2015-09-08 17:13:00"), class = "factor"), h4temp = c(27.2, 27.2, 
27.2, 27.2, 27.2, 27.2), h4humid = c(33.5, 33.5, 33.5, 33.5, 
33.5, 33.5), h4db = c(36.8225, 36.921, 36.8766666666667, 36.91, 
36.8336666666667, 36.768), h4hz = c(134.765, 136.068333333333, 
137.373333333333, 126.3, 139.323333333333, 128.906666666667)), .Names =       
c("h4min", "h4temp", "h4humid", "h4db", "h4hz"), row.names = c(NA, 6L), class = "data.frame")

this attempt yields:

H_min<-merge(H1_min, H2_min, H3_min, H4_min, H5_min, H6_min, by.x = 'row.names', by.y ='h1_min')

Error in fix.by(by.y, y) : 'by' must specify a uniquely valid column

Upvotes: 0

Answers (3)

Tymo

Reputation: 259

A bit simpler way to fix this based on @jlhoward answer.

qxts1 <- xts(df1[,-1], order.by = df1[,1]) 
qxts2 <- xts(df2[,-1], order.by = df2[,1])

xts.lst = list(qxts1, qxts2)
result <- do.call(merge.xts, c(xts.lst, all=FALSE))
result <- data.frame(result)

For xts or zoo, make sure your TimeStamp is a vector or matrix carrying data as Date, POSIXct, chron, ...

Upvotes: 0

jlhoward

Reputation: 59355

Another way to do this is to convert the data.frames to xts objects and then use merge.xts(...), which merges based on the timestamp automatically, and then convert the result back to a data.frame.

Most of the code below is just to create reproducible sample data. The actual work is in the 6 lines at the end.

# create representative example - you have this already
time <- as.character(as.POSIXct("2015-09-06") + 60*(0:30))
temp = c(23.4, 23.4, 23.3, 23.2, 23.2, 23.1)
humid = c(38.5, 38.3, 38.05, 38.1, 38.6, 38.6)
db = c(38.834, 38.655, 38.679, 38.695, 38.806, 38.702)
hz = c(191.41, 152.34, 162.11, 113.28, 121.09, 164.06)
set.seed(123)   # for reproducible example
get.df <- function(n, name) {
  df <- data.frame(min=sort(sample(time,n)), 
                   temp=sample(temp,n, replace=TRUE), 
                   humid=sample(humid,n,replace=TRUE),
                   db = sample(db,n,replace=TRUE),
                   hz = sample(hz,n,replace=TRUE))
  names(df) <- paste0(name,names(df))
  df
}
H1 <- get.df(20,"h1")    # 20 rows at random times
H2 <- get.df(20,"h2")    # 20 rows at random times
H3 <- get.df(25,"h3")    # 25 rows at random times
H4 <- get.df(30,"h4")    # 30 rows at random times
# you start here
library(xts)
lst <- list(H1, H2, H3, H4)
xts.lst <- lapply(lst, function(df) xts(df[,2:ncol(df)], order.by=as.POSIXct(df[[1]])))
result <- do.call(merge.xts, c(xts.lst, all=FALSE))
result <- data.frame(result)
head(result)
#                     h1temp h1humid   h1db   h1hz h2temp h2humid   h2db   h2hz h3temp h3humid   h3db   h3hz h4temp h4humid   h4db   h4hz
# 2015-09-06 00:03:00   23.2   38.05 38.679 162.11   23.4    38.5 38.695 121.09   23.3    38.3 38.702 191.41   23.4    38.5 38.679 162.11
# 2015-09-06 00:04:00   23.1   38.05 38.655 121.09   23.4    38.3 38.679 152.34   23.2    38.1 38.679 121.09   23.1    38.3 38.834 121.09
# 2015-09-06 00:09:00   23.2   38.50 38.679 162.11   23.4    38.5 38.655 113.28   23.3    38.3 38.834 191.41   23.4    38.6 38.655 191.41
# 2015-09-06 00:12:00   23.4   38.30 38.806 164.06   23.4    38.3 38.679 164.06   23.4    38.6 38.834 162.11   23.4    38.3 38.655 121.09
# 2015-09-06 00:13:00   23.4   38.60 38.679 152.34   23.2    38.6 38.655 164.06   23.3    38.6 38.679 162.11   23.4    38.5 38.679 121.09
# 2015-09-06 00:14:00   23.1   38.50 38.806 191.41   23.2    38.6 38.695 152.34   23.4    38.6 38.834 162.11   23.3    38.5 38.834 191.41

Upvotes: 2

bramtayl

Reputation: 4024

library(dplyr)
library(magrittr)
library(tidyr)

H1_min = 
  data_frame(
    h1min = c("2015-09-06 00:00:00", "2015-09-06 00:02:00"),
    h1temp = c(21.5, 21.5),
    h1humid = c(73.10, 72.50),
    h1db = c(39.252, 39.434),
    h1hz = c(117.1900, 125.000) )

H2_min = H1_min %>% mutate(h1hz = c(117.1900, NA))

answer = 
  list(H1_min, H2_min) %>%
  lapply(. %>% setNames(c("min",
                          "temp",
                          "humid",
                          "db",
                          "hz"))) %>%
  bind_rows(.id = "location") %>%
  gather(variable, value, -location, -min) %>%
  mutate(prefix = "h") %>%
  unite(new_variable, prefix, location, variable, sep = "") %>%
  spread(new_variable, value) %>%
  filter(complete.cases(.))

Upvotes: 0

merge multiple dataframes based on matching timestamp

Answers (3)

Related Questions