Reputation: 623
I have a dataframe consisting of three columns: x, ID and date_time. The “x” column is a recording of a variable x, ID indicate what is being recorded, while date_time indicates when. See a piece of the dataframe below.
From this dataframe I would like to calculate a new dataframe that has seven columns: "Measurement", "ID" and "Date", “x_4_10_day”, “Day_total”, “x_4_10_night”, “Night_total”.
There should be a row for every unique measurement. So far I have a code that returns the columns: "Measurement", "ID" and "Date” correctly:
df1$mydate = as.Date(df1$date_time, format = "%Y.%m.%d %H:%M:%S")
df1$tm <- as.numeric(df1$date_time)
df1$dts <- 86400*as.numeric(df1$mydate)
df2 <- df1 %>%
group_by(ID,mydate) %>%
transform(date = case_when(((dts-3600)<tm & tm<(dts+82800)) ~paste0(mydate), ((dts+82800)<=tm) ~paste0(mydate+1) )) %>%
select(ID,date) %>%
unique() %>%
group_by(ID) %>%
mutate(measurement = row_number())
however I don’t know how to do the last ones.
Here’s an expected output:
dummy_output <- read.table(header=TRUE, text ="
ID Date Measurement x_4_10_day Day_total x_4_10_night Night_total
12 2020.03.02 1 30 40 0 0
12 2020.03.03 2 0 0 45 75
13 2020.05.09 1 90 90 0 0
")
Any suggestions are much appreciated, thanks!
And here's the data:
structure(list(date_time = c("2020.03.02 22:00:17", "2020.03.02 22:05:17",
"2020.03.02 22:10:17", "2020.03.02 22:35:17", "2020.03.02 22:40:17",
"2020.03.02 22:45:17", "2020.03.02 22:50:17", "2020.03.02 22:55:17",
"2020.03.02 23:00:17", "2020.03.02 23:05:17", "2020.03.02 23:10:17",
"2020.03.02 23:15:17", "2020.03.02 23:20:17", "2020.03.02 23:25:17",
"2020.03.02 23:30:17", "2020.03.02 23:35:17", "2020.03.02 23:40:17",
"2020.03.02 23:45:17", "2020.03.02 23:50:17", "2020.03.02 23:55:17",
"2020.03.03 00:00:17", "2020.03.03 00:55:17", "2020.03.03 01:00:17",
"2020.03.03 01:05:17", "2020.03.03 01:10:17", "2020.03.03 01:15:17",
"2020.03.03 01:20:17", "2020.03.03 01:25:17", "2020.05.09 08:39:32",
"2020.05.09 08:39:32", "2020.05.09 08:39:32", "2020.05.09 08:39:32",
"2020.05.09 08:39:32", "2020.05.09 08:39:32", "2020.05.09 08:39:32",
"2020.05.09 08:39:32", "2020.05.09 08:39:32", "2020.05.09 08:39:32",
"2020.05.09 08:39:32", "2020.05.09 08:39:32", "2020.05.09 08:39:32",
"2020.05.09 08:39:32", "2020.05.09 08:39:32", "2020.05.09 08:39:32",
"2020.05.09 08:39:32", "2020.05.09 08:39:32"), id = c(12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L,
13L, 13L, 13L, 13L, 13L), x = c("7.55", "4.55", "4.55", "12",
"12", "10", "10", "4.3", "", "", "4.3", "4.3", "4.3", "", "4.3",
"12", "12", "12", "2", "12", "12", "", "8", "3", "3", "2", "2",
"", "12", "10", "10", "4.3", "4.3", "4.3", "4.3", "4.3", "4.3",
"4.3", "4.3", "12", "12", "12", "12", "12", "12", "12")), row.names = c(NA,
46L), class = "data.frame")
Upvotes: 0
Views: 119
Reputation: 26218
It took me some time but probably you want this
sample data (changed a bit as date/time in 13
were all same
df <- structure(list(date_time = c("2020.03.02 22:00:17", "2020.03.02 22:05:17",
"2020.03.02 22:10:17", "2020.03.02 22:35:17", "2020.03.02 22:40:17",
"2020.03.02 22:45:17", "2020.03.02 22:50:17", "2020.03.02 22:55:17",
"2020.03.02 23:00:17", "2020.03.02 23:05:17", "2020.03.02 23:10:17",
"2020.03.02 23:15:17", "2020.03.02 23:20:17", "2020.03.02 23:25:17",
"2020.03.02 23:30:17", "2020.03.02 23:35:17", "2020.03.02 23:40:17",
"2020.03.02 23:45:17", "2020.03.02 23:50:17", "2020.03.02 23:55:17",
"2020.03.03 00:00:17", "2020.03.03 00:55:17", "2020.03.03 01:00:17",
"2020.03.03 01:05:17", "2020.03.03 01:10:17", "2020.03.03 01:15:17",
"2020.03.03 01:20:17", "2020.03.03 01:25:17", "2020.05.09 08:39:32",
"2020.05.09 08:44:32", "2020.05.09 08:49:32", "2020.05.09 08:54:32",
"2020.05.09 08:59:32", "2020.05.09 09:39:32", "2020.05.09 09:44:32",
"2020.05.09 09:49:32", "2020.05.09 09:59:32", "2020.05.09 10:39:32",
"2020.05.09 11:39:32", "2020.05.09 12:39:32", "2020.05.09 13:39:32",
"2020.05.09 14:39:32", "2020.05.09 15:39:32", "2020.05.09 16:39:32",
"2020.05.09 17:39:32", "2020.05.09 18:39:32"), id = c(12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L,
13L, 13L, 13L, 13L, 13L), x = c("7.55", "4.55", "4.55", "12",
"12", "10", "10", "4.3", "", "", "4.3", "4.3", "4.3", "", "4.3",
"12", "12", "12", "2", "12", "12", "", "8", "3", "3", "2", "2",
"", "12", "10", "10", "4.3", "4.3", "4.3", "4.3", "4.3", "4.3",
"4.3", "4.3", "12", "12", "12", "12", "12", "12", "12")), row.names = c(NA,
46L), class = "data.frame")
Edited Result
library(tidyverse)
library(lubridate)
df %>% as_tibble() %>%
transform(x = as.numeric(x),
date_time = as_datetime(date_time),
id = as.character(id)) %>%
mutate(d_n = ifelse(hour(date_time)>=7 & hour(date_time)<23, 'day', 'night'),
Date = as.Date(date_time, format = "%Y.%m.%d %H:%M:%S"),
valid_m = ifelse(x>=4 & x<= 10, 1, 0)) %>%
mutate(valid_m = ifelse(is.na(valid_m), 0, valid_m)) %>% #valid measurements
arrange(id, date_time) %>%
group_by(id) %>%
mutate(validm_d = as.numeric(lead(date_time)-date_time)) %>%
filter(!is.na(validm_d)) %>%
group_by(id, Date, d_n, valid_m) %>%
summarise(x_tm = sum(validm_d)) %>%
ungroup() %>%
pivot_wider(names_from = d_n, values_from = x_tm, values_fill =0) %>%
group_by(id, Date) %>%
mutate(day_t = sum(day), night_t = sum(night)) %>%
filter(valid_m != 0) %>%
group_by(id) %>%
mutate(measurement = row_number()) %>%
select(id, measurement, Date, x_4_10_day =day, x_4_10_total =day_t,
x_4_10_night =night, x_4_10_totaln = night_t)
desired_result
id measurement Date x_4_10_day x_4_10_total x_4_10_night x_4_10_totaln
<chr> <int> <date> <dbl> <dbl> <dbl> <dbl>
1 12 1 2020-03-02 50 60 20 60
2 12 2 2020-03-03 0 0 5 85
3 13 1 2020-05-09 235 600 0 0
In this solution I have removed last value of each measurement as I wasn't sure how long that measurement was to be taken. You can change the code appropriately. Basically last measurement of 'day' ended 2300 hrs therefore results of first row should have been 17 seconds fewer than those shown.
Upvotes: 1
Reputation: 21297
I have added id=14
with only night values to your dataframe. Perhaps this is what you are looking for. Please note that your expected values do not comply with your requirements fully.
df11 <- structure(list(date_time = c("2020.03.02 22:00:17", "2020.03.02 22:05:17",
"2020.03.02 22:10:17", "2020.03.02 22:35:17", "2020.03.02 22:40:17",
"2020.03.02 22:45:17", "2020.03.02 22:50:17", "2020.03.02 22:55:17",
"2020.03.02 23:00:17", "2020.03.02 23:05:17", "2020.03.02 23:10:17",
"2020.03.02 23:15:17", "2020.03.02 23:20:17", "2020.03.02 23:25:17",
"2020.03.02 23:30:17", "2020.03.02 23:35:17", "2020.03.02 23:40:17",
"2020.03.02 23:45:17", "2020.03.02 23:50:17", "2020.03.02 23:55:17",
"2020.03.03 00:00:17", "2020.03.03 00:55:17", "2020.03.03 01:00:17",
"2020.03.03 01:05:17", "2020.03.03 01:10:17", "2020.03.03 01:15:17",
"2020.03.03 01:20:17", "2020.03.03 01:25:17", "2020.05.09 08:39:32",
"2020.05.09 08:39:32", "2020.05.09 08:39:32", "2020.05.09 08:39:32",
"2020.05.09 08:39:32", "2020.05.09 08:39:32", "2020.05.09 08:39:32",
"2020.05.09 08:39:32", "2020.05.09 08:39:32", "2020.05.09 08:39:32",
"2020.05.09 08:39:32", "2020.05.09 08:39:32", "2020.05.09 08:39:32",
"2020.05.09 08:39:32", "2020.05.09 08:39:32", "2020.05.09 08:39:32",
"2020.05.09 08:39:32", "2020.05.09 08:39:32",
"2020.03.02 23:45:17", "2020.03.02 23:50:17", "2020.03.02 23:55:17",
"2020.03.03 00:00:17", "2020.03.03 00:55:17", "2020.03.03 01:00:17"
),
x = c("7.55", "4.55", "4.55", "12",
"12", "10", "10", "4.3", "", "", "4.3", "4.3", "4.3", "", "4.3",
"12", "12", "12", "2", "12", "12", "", "8", "3", "3", "2", "2",
"", "12", "10", "10", "4.3", "4.3", "4.3", "4.3", "4.3", "4.3",
"4.3", "4.3", "12", "12", "12", "12", "12", "12", "12",
"12", "10", "10", "4.3", "4.3", "4.3"),
id = c(12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L,
13L, 13L, 13L, 13L, 13L, 14L, 14L, 14L, 14L, 14L, 14L)),
row.names = c(NA, 52L), class = "data.frame")
df11$xn <- as.numeric(df11$x)
df1 <- df11 %>% transform(xmin = ifelse((xn<4 | xn>10 | is.na(xn)),0,5 ),
xmint = ifelse(is.na(xn),-5,5 ))
df1$dateTime = as_datetime(df1$date_time, format = "%Y.%m.%d %H:%M:%S")
df1$mydate = as.Date(df1$date_time, format = "%Y.%m.%d %H:%M:%S")
df1$tm <- as.numeric(df1$dateTime)
df1$dts <- 86400*as.numeric(df1$mydate)
df2 <- df1 %>% group_by(id,mydate) %>%
transform(date = case_when(((dts-3600)<tm & tm<(dts+82800) )~paste0(mydate),((dts+82800)<=tm)~paste0(mydate+1) )) %>%
transform(dayrnight = ifelse((tm>=(dts+25200) & tm<(dts+82800) ),'day','night' ) ) %>%
group_by(id,date,dayrnight) %>%
dplyr::summarise(x_4_10 = sum(xmin), total = sum(xmint)) %>%
pivot_wider(id_cols = c(id,date), names_from = dayrnight, values_from = c("x_4_10", "total")) %>%
mutate_if(is.numeric , replace_na, replace = 0) %>%
group_by(id) %>% mutate(measurement = row_number()) %>%
select(id,date,measurement,x_4_10_day,total_day,x_4_10_night,total_night)
> df2
# A tibble: 4 x 7
# Groups: id [3]
id date measurement x_4_10_day total_day x_4_10_night total_night
<int> <chr> <int> <dbl> <dbl> <dbl> <dbl>
1 12 2020-03-02 1 30 40 0 0
2 12 2020-03-03 2 0 0 25 50
3 13 2020-05-09 1 50 90 0 0
4 14 2020-03-03 1 0 0 25 30
Upvotes: 1