Reputation: 583
I have a problem where I need to reshape a long format data table into a wide format with non-overlapping entries based on ID1 and ID2. The logic is quite complex and depends on 3 columns ("Seq, "ID1" and "ID2").
Value_1 belonging to ID1 should be summed if it 'overlaps' with ID2 and vice-versa but only for distinct ID's.
See below for an input example and output, hope that clarifies it.
input:
df <- structure(list(Seq = c(9143L, 916L, 9293L, 9301L, 9302L, 9304L,
9305L, 9306L, 9307L, 931L, 9311L), ID1 = c("ID1_1", "ID1_1",
NA, "ID1_2", "ID1_2", NA, "ID1_3", "ID1_3", "ID1_3", "ID1_4",
"ID1_4"), value_1 = c(30L, 30L, NA, 30L, 30L, NA, 30L, 30L, 30L,
50L, 50L), ID2 = c(NA, NA, "ID2_1", "ID2_2", "ID2_3", "ID2_4",
"ID2_4", "ID2_4", "ID2_4", "ID2_4", "ID2_5"), value_2 = c(NA,
NA, 33L, 200L, 46L, 58L, 58L, 58L, 58L, 58L, 46L)), class = "data.frame", row.names = c(NA,
-11L))
output:
(notice for example the last row, value_1 = 80 because 30+50 from summing up the values belonging to ID1_3 and ID1_4)
Upvotes: 2
Views: 143
Reputation: 5788
Not as succinct as above, but a Base R solution none the less:
# Function to calculate the aggregate value: .agg_func => function()
.agg_func <- function(df, id_col, value_col){
sbst <- subset(
df,
!(is.na(df[,id_col])) & !(duplicated(df[,id_col])),
select = c(id_col, value_col)
)
return(sum(sbst[,value_col], na.rm = TRUE))
}
# Function to group data by ids: .grouping_func => function()
.grouping_func <- function(df, id_col){
r_l_e <- rle(df[,id_col])
rle_id <- rep(seq_along(r_l_e$values), times = r_l_e$lengths)
return(c(0, diff(rle_id)) != 0)
}
# Group the data: grpd_df => data.frame
grpd_df <- transform(
df,
grp = cumsum(
apply(
vapply(
names(df)[startsWith(names(df), "ID")],
function(x).grouping_func(df, x),
logical(nrow(df))
),
1,
all
)
)
)
# Split-apply-combine the aggregate function to the grouped data:
data.frame(do.call(rbind, lapply(with(grpd_df, split(grpd_df, grp)), function(s){
data.frame(
Seq = toString(s$Seq),
value_1 = .agg_func(s, "ID1", "value_1"),
value_2 = .agg_func(s, "ID2", "value_2")
)
}
)
), row.names = NULL, stringsAsFactors = FALSE
)
Upvotes: 1
Reputation: 683
Firstly, I really like AnilGoyal's solution. I can see that I need to start using the data.table package.
That being said, I was working on a dplyr approach sans data.table, which is conspicuously more verbose. Also, it took me awhile to figure out what to do with duplicate values. Multiplying by the changei
column (0 or 1) removed duplicates. The following was my approach:
df %>%
mutate_if(is.numeric, replace_na, 0) %>%
mutate_if(is.character, replace_na, "NA") %>%
mutate(
change1 = ID1 != lag(ID1, default = "Start"),
value_1 = value_1 * change1,
change2 = ID2 != lag(ID2, default = "Start"),
value_2 = value_2 * change2,
change = cumsum(change1 & change2)
) %>%
group_by(change) %>%
summarise(
Seq = toString(Seq),
value_1 = sum(value_1, na.rm = T),
value_2 = sum(value_2, na.rm = T)
) %>%
ungroup()
The result was:
df
# change Seq value_1 value_2
# <int> <chr> <dbl> <dbl>
# 1 1 9143, 916 30 0
# 2 2 9293 0 33
# 3 3 9301, 9302 30 246
# 4 4 9304, 9305, 9306, 9307, 931, 9311 80 104
Upvotes: 1
Reputation: 26218
I used rleid()
function from data.table package, which is a fascinating function to calculate run-length encoding. Do it like this
library(data.table)
library(dplyr)
df %>%
mutate(d = cumsum( c(0, diff(rleid(ID1))) != 0 & c(0, diff(rleid(ID2))) != 0),
value_1 = value_1 * c(1, diff(rleid(ID1))),
value_2 = value_2 * c(1, diff(rleid(ID2)))) %>% group_by(d) %>%
summarise(Seq = toString(Seq),
value_1 = sum(value_1, na.rm = T),
value_2 = sum(value_2, na.rm = T)) %>%
ungroup() %>% select(-d)
# A tibble: 4 x 3
Seq value_1 value_2
<chr> <int> <int>
1 9143, 916 30 0
2 9293 0 33
3 9301, 9302 30 246
4 9304, 9305, 9306, 9307, 931, 9311 80 104
Old answer
df %>% group_by(d = cumsum( c(0, diff(rleid(ID1))) != 0 & c(0, diff(rleid(ID2))) != 0)) %>%
summarise(Seq = toString(Seq),
value_1 = sum(unique(value_1), na.rm = T),
value_2 = sum(unique(value_2), na.rm = T)) %>%
ungroup() %>% select(-d)
Upvotes: 4