Reputation: 57
I have this data frame:
library(dplyr)
library(tidyr)
data <- tribble(
~Date, ~A1, ~A2,~B1,~B2,
as.Date("2019-01-01"), 20, 10,20, 10,
as.Date("2019-01-01"), 20 ,5,20,5,
as.Date("2019-01-01"), 10, 2,10,20,
as.Date("2019-01-01"), 20, 60,0,0,
as.Date("2019-01-01"), 30, 4,20,5,
as.Date("2019-02-01"), 0, 0,16,8,
as.Date("2019-02-01"), 0, 0,0,40,
as.Date("2019-02-01"), 0, 0,4,2,
as.Date("2019-02-01"), 4, 8,10,6,
as.Date("2019-02-01"), 6, 3,0,0,
as.Date("2019-03-01"), 20, 8,23,9,
as.Date("2019-03-01"), 60, 4,0,0,
as.Date("2019-03-01"), 4, 2,8,3,
as.Date("2019-03-01"), 0, 6,10,0
)
For each day I want to calculate the mean of (A1-B1) and of (A2-B2).
For A1-B1 I want to use only rows where A1>B1 and A1>0,B1>0.
For A2-B2 I want to use only rows where A2>B2 and A2>0,B2>0.
This is what I have tried:
data_mean = data %>%
group_by(Date) %>%
dplyr::summarise(
mean_1 = mean(A1[A1>=B1 & A1>0 & B1>0] - B1[A1>=B1 & A1>0 & B1>0]),
mean_2 = mean(A2[A2>=B2 & A2>0 & B2>0] - B2[A2>=B2 & A2>0 & B2>0]))
Is there a way to use the filter function while using summarise function? Or a smarter way to apply my code?
Upvotes: 3
Views: 1428
Reputation: 78917
Update: Thank to akrun!!! Now it works!
data %>%
filter(if_all(where(is.numeric), ~ . > 0)) %>%
mutate(i1 = A1 >= B1, i2 = A2 >= B2) %>%
group_by(Date) %>%
summarise(mean1 = mean(A1[i1] - B1[i1]), mean2 = mean(A2[i2] - B2[i2]))
Output:
Date mean1 mean2
<date> <dbl> <dbl>
1 2019-01-01 2.5 0
2 2019-02-01 NaN 2
3 2019-03-01 NaN NaN
First version I get almost akrun's solution. But can't handle the minus numbers
data %>%
group_by(Date) %>%
filter_if(is.numeric, all_vars((.) != 0)) %>%
filter(A1>=B1 | A2>=B2) %>%
summarise(mean1 = mean(A1-B1),
mean2 = mean(A2-B2))
Output:
Date mean1 mean2
<date> <dbl> <dbl>
1 2019-01-01 2.5 -4.75
2 2019-02-01 -6 2
Upvotes: 5
Reputation: 887048
If we don't want to repeat the expressions, create a temporary column. Also, this can be done for multiple columns within across
library(dplyr)
library(stringr)
data %>%
group_by(Date) %>%
summarise(across(c(A1, A2), ~ {
tmp <- get(str_replace(cur_column(), 'A', 'B'))
i1 <- . >= tmp & . > 0 & tmp >0
mean(.[i1] - tmp[i1])})) %>%
rename_with(~ str_replace(., 'A', 'mean_'), -Date)
-output
# A tibble: 3 x 3
# Date mean_1 mean_2
#* <date> <dbl> <dbl>
#1 2019-01-01 2.5 0
#2 2019-02-01 NaN 2
#3 2019-03-01 NaN NaN
Or another option is to convert to 'long' format with pivot_longer
, do the filter/group_by/summarise
and then reshape to 'wide' with pivot_wider
library(tidyr)
data %>%
pivot_longer(cols = A1:B2, names_to = c('.value', 'grp'),
names_sep = '(?<=[A-Z])(?=\\d)') %>%
filter(A >= B, A > 0, B > 0) %>%
group_by(Date, grp = str_c('mean_', grp)) %>%
summarise(mean = mean(A - B), .groups = 'drop') %>%
pivot_wider(names_from = grp, values_from = mean) %>%
complete(Date = unique(data$Date))
# A tibble: 3 x 3
# Date mean_1 mean_2
# <date> <dbl> <dbl>
#1 2019-01-01 2.5 0
#2 2019-02-01 NA 2
#3 2019-03-01 NA NA
Upvotes: 5
Reputation: 21908
I hope this code gets you to your desired output:
library(dplyr)
data %>%
group_by(Date) %>%
filter(A1 >= B1 & B1 > 0 | A2 >= B2 & B2 > 0) %>%
mutate(sub1 = A1 - B1,
sub2 = A2 - B2) %>%
summarise(mean1 = mean(sub1),
mean2 = mean(sub2))
# A tibble: 2 x 3
Date mean1 mean2
<date> <dbl> <dbl>
1 2019-01-01 2.5 -4.75
2 2019-02-01 -6 2
Upvotes: 2