Reputation: 361
df1:
library(tidyverse)
library(lubridate)
ex1 <- tibble(date = seq.Date(from = ymd('20200101'), length.out = 100, by = 'day'),
a = rnorm(100, mean = 1, sd = 2),
b = runif(100, min = 1, max = 2),
c = rnorm(100, mean = 3, sd = 1),
d = runif(100, min = 50, max = 60))
df2:
cal_c <- tibble(variable = c('a', 'b', 'c','d'),
start = c(ymd('20200101', '20200103', '20200203', '20200103')),
end = c(ymd('20200204', '20200405', '20200301', '20200401')),
total = c('NA', 'NA', 'NA', 'NA'))
I want to calc every row in df2 within the date range in the start and end based on df1, say a$toal between '2020-1-1' to '2020-2-4', b$total between '2020-1-3' to '2020-4-5', any help, thanks a lot.
Upvotes: 0
Views: 211
Reputation: 25225
An option using data.table
:
cal_c[, total :=
ex1[cal_c, on=.(date>=start, date<=end), by=.EACHI,
sum(.SD[[variable]])]$V1
]
output:
variable start end total
1: a 2020-01-01 2020-02-04 34.04780
2: b 2020-01-03 2020-04-05 135.40290
3: c 2020-02-03 2020-03-01 91.10271
4: d 2020-01-03 2020-04-01 4978.59884
data:
set.seed(0L)
library(data.table)
ex1 <- data.table(date = seq.Date(from = as.IDate('20200101', format="%Y%m%d"), length.out = 100, by = 'day'),
a = rnorm(100, mean = 1, sd = 2),
b = runif(100, min = 1, max = 2),
c = rnorm(100, mean = 3, sd = 1),
d = runif(100, min = 50, max = 60))
cal_c <- data.table(variable = c('a', 'b', 'c','d'),
start = as.IDate(c('20200101', '20200103', '20200203', '20200103'), format="%Y%m%d"),
end = as.IDate(c('20200204', '20200405', '20200301', '20200401'), format="%Y%m%d"))
Upvotes: 0
Reputation: 5798
Base R Solution:
cal_c$total <- sapply(split(cal_c, rownames(cal_c)), function(x){
sum(ex1[((ex1$date >= x$start) & (ex1$date <= x$end)), match(x$variable, names(ex1))])})
Upvotes: 0
Reputation: 389175
We can create a sequence of start
and end
dates for cal_c
data, get ex1
in long format and join. We can then sum
value
for each variable
.
library(tidyverse)
cal_c %>%
mutate(date = map2(start, end, seq, by = 'day')) %>%
unnest(date) %>%
left_join(ex1 %>% pivot_longer(cols = -date, names_to = 'variable'),
by = c('variable', 'date')) %>%
group_by(variable, start, end) %>%
summarise(value = sum(value, na.rm = TRUE))
# variable start end value
# <chr> <date> <date> <dbl>
#1 a 2020-01-01 2020-02-04 34.3
#2 b 2020-01-03 2020-04-05 136.
#3 c 2020-02-03 2020-03-01 79.5
#4 d 2020-01-03 2020-04-01 4909.
Upvotes: 1