Reputation: 35
I'm trying to find a cumulative sum for two separate groups, and to have each of these sums listed in separate columns.
Here is the data frame, sorted based on time:
time group value
0 A 0
0 B 0
0 A 0
1 A 0
1 B 1
1 B 0
2 B 1
2 A 1
2 A 1
2 A -1
3 A 0
3 B 1
This is what I have to find cumsum by group, and to create the cumsum column:
df$cumsum <- ave(df$value, df$group, FUN=cumsum)
time group value cumsum
0 A 0 0
0 B 0 0
0 A 0 0
1 A 0 0
1 B 1 1
1 B 0 1
2 B 1 2
2 A 1 1
2 A 1 2
2 A -1 1
3 A 0 1
3 B 1 3
How is it possible to separate the results into two columns, one for A and one for B? Alternatively, would it be possible to find a conditional cumsum? Either way, I want the results to look like this:
time group value cumsum_A cumsum_B
0 A 0 0 0
0 B 0 0 0
0 A 0 0 0
1 A 0 0 0
1 B 1 0 1
1 B 0 0 1
2 B 1 0 2
2 A 1 1 2
2 A 1 2 2
2 A -1 1 2
3 A 0 1 2
3 B 1 1 3
Thanks!
Upvotes: 2
Views: 184
Reputation: 15072
You can also use if_else
to replace value
with 0
when it is not part of the desired group as below. dplyr
is not necessary here (use base::ifelse
and avoid mutate
)
library(tidyverse)
df1 <- structure(list(time = c(0L, 0L, 0L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L), group = c("A", "B", "A", "A", "B", "B", "B", "A", "A", "A", "A", "B"), value = c(0L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, -1L, 0L, 1L)), class = "data.frame", row.names = c(NA, -12L))
df1 %>%
mutate(
cumsum_A = cumsum(if_else(group == "A", value, 0L)),
cumsum_B = cumsum(if_else(group == "B", value, 0L))
)
#> time group value cumsum_A cumsum_B
#> 1 0 A 0 0 0
#> 2 0 B 0 0 0
#> 3 0 A 0 0 0
#> 4 1 A 0 0 0
#> 5 1 B 1 0 1
#> 6 1 B 0 0 1
#> 7 2 B 1 0 2
#> 8 2 A 1 1 2
#> 9 2 A 1 2 2
#> 10 2 A -1 1 2
#> 11 3 A 0 1 2
#> 12 3 B 1 1 3
Created on 2019-06-25 by the reprex package (v0.3.0)
Upvotes: 1
Reputation: 887058
Here is an option with table
and colCumsums
library(matrixStats)
nm1 <- paste0("cumsum_", unique(df1$group))
df1[nm1] <- colCumsums(table(seq_len(nrow(df1)),df1$group) * df1$value)
df1
# time group value cumsum_A cumsum_B
#1 0 A 0 0 0
#2 0 B 0 0 0
#3 0 A 0 0 0
#4 1 A 0 0 0
#5 1 B 1 0 1
#6 1 B 0 0 1
#7 2 B 1 0 2
#8 2 A 1 1 2
#9 2 A 1 2 2
#10 2 A -1 1 2
#11 3 A 0 1 2
#12 3 B 1 1 3
Or another option is model.matrix
colCumsums((model.matrix(~ group -1, df1)) * df1$value)
Or the model.matrix
with tidyverse
library(tidyverse)
df1 %>%
model.matrix( ~group - 1, .) %>%
as_tibble %>%
mutate_all(~ cumsum(. * df1$value)) %>%
rename_all(~ str_replace(., "group", "cumsum")) %>%
bind_cols(df1, .)
# time group value cumsumA cumsumB
#1 0 A 0 0 0
#2 0 B 0 0 0
#3 0 A 0 0 0
#4 1 A 0 0 0
#5 1 B 1 0 1
#6 1 B 0 0 1
#7 2 B 1 0 2
#8 2 A 1 1 2
#9 2 A 1 2 2
#10 2 A -1 1 2
#11 3 A 0 1 2
#12 3 B 1 1 3
Or using count
along with spread
df1 %>%
mutate(rn = row_number()) %>%
dplyr::count(group, rn) %>%
mutate(group = str_c("cumsum", group)) %>%
spread(group, n, fill = 0) %>%
mutate_at(-1, ~ cumsum(. * df1$value)) %>%
select(-rn) %>%
bind_cols(df1, .)
df1 <- structure(list(time = c(0L, 0L, 0L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
3L, 3L), group = c("A", "B", "A", "A", "B", "B", "B", "A", "A",
"A", "A", "B"), value = c(0L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 1L,
-1L, 0L, 1L)), class = "data.frame", row.names = c(NA, -12L))
Upvotes: 0
Reputation: 388962
You could first find out the unique
values and using sapply
/lapply
loop over them to conditionally calculate cumsum
for each one of them.
unique_val <- unique(df$group)
df[paste0("cumsum_", unique_val)] <- lapply(unique_val,
function(x) cumsum((df$group == x) * df$value))
df
# time group value cumsum_A cumsum_B
#1 0 A 0 0 0
#2 0 B 0 0 0
#3 0 A 0 0 0
#4 1 A 0 0 0
#5 1 B 1 0 1
#6 1 B 0 0 1
#7 2 B 1 0 2
#8 2 A 1 1 2
#9 2 A 1 2 2
#10 2 A -1 1 2
#11 3 A 0 1 2
#12 3 B 1 1 3
Upvotes: 1