Reputation: 439
I'm trying to drop duplicates and keep the row with the maximum values. I can do this separately per strategy.
However, when trying to do this based on two separate conditionals based on the strategy, the dataframe tends to overwrite one another when trying to apply these.
This is needed given that one strategy contains values that one strategy has and another does not; note these do share one common column though.
ID strategy Common DNA_Col RNA_Col
1 ABA DNA 0.65 0.66 NA
2 ABB RNA 0.65 NA 0.15
3 ABB RNA 0.65 NA 0.12
4 ABC DNA 0.55 0.88 NA
5 ABC DNA 0.14 0.14 NA
6 ABC DNA 0.15 0.50 NA
7 ABD RNA 0.25 NA 0.12
ID strategy Common DNA_Col RNA_Col
1 ABA DNA 0.65 0.66 NA
2 ABB RNA 0.65 NA 0.15
3 ABC DNA 0.55 0.88 NA
4 ABD RNA 0.25 NA 0.12
Producing the Dataframe:
> df <- data.frame(
+ stringsAsFactors = FALSE,
+ ID = c("ABA", "ABB", "ABB", "ABC", "ABC", "ABC", "ABD"),
+ strategy =c("DNA", "RNA", "RNA", "DNA", "DNA", "DNA", "RNA"),
+ Common = c(0.65, 0.65, 0.65, 0.55, 0.14, 0.15, 0.25),
+ DNA_Col= c(0.66, NA, NA, 0.88, 0.14, 0.5, NA),
+ RNA_Col = c(NA, 0.15, 0.12, NA, NA, NA, 0.12)
+ )
Applying Conditionals
if (df$strategy == "RNA") {
df = df %>% group_by(id) %>% slice_max(RNA_Col, n=1) %>% ungroup
} else if (df$strategy == "DNA") {
df = df %>% group_by(df) %>% slice_max(DNA_Col, n=1) %>% ungroup
}
Upvotes: 2
Views: 232
Reputation: 25483
A possible solution:
library(tidyverse)
df <- data.frame(
stringsAsFactors = FALSE,
ID = c("ABA", "ABB", "ABB", "ABC", "ABC", "ABC", "ABD"),
strategy = c("DNA", "RNA", "RNA", "DNA", "DNA", "DNA", "RNA"),
Common = c(0.65, 0.65, 0.65, 0.55, 0.14, 0.15, 0.25),
DNA_Col = c(0.66, NA, NA, 0.88, 0.14, 0.5, NA),
RNA_Col = c(NA, 0.15, 0.12, NA, NA, NA, 0.12)
)
df %>%
group_by(ID) %>%
slice_max(DNA_Col, n = 1) %>%
ungroup %>%
bind_rows(
df %>%
group_by(ID) %>%
slice_max(RNA_Col, n = 1) %>%
ungroup)
#> # A tibble: 4 × 5
#> ID strategy Common DNA_Col RNA_Col
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 ABA DNA 0.65 0.66 NA
#> 2 ABC DNA 0.55 0.88 NA
#> 3 ABB RNA 0.65 NA 0.15
#> 4 ABD RNA 0.25 NA 0.12
Upvotes: 2
Reputation: 208
This can be done by using the pivot_longer()
function to bring the values from the RNA_Col
and DNA_Col
variables into one single column to be handled simultaneously, this column can then be used to repopulate the columns using ifelse()
.
library(tidyverse)
df <- data.frame(
stringsAsFactors = FALSE,
ID = c("ABA", "ABB", "ABB", "ABC", "ABC", "ABC", "ABD"),
strategy =c("DNA", "RNA", "RNA", "DNA", "DNA", "DNA", "RNA"),
Common = c(0.65, 0.65, 0.65, 0.55, 0.14, 0.15, 0.25),
DNA_Col= c(0.66, NA, NA, 0.88, 0.14, 0.5, NA),
RNA_Col = c(NA, 0.15, 0.12, NA, NA, NA, 0.12)
)
df %>%
pivot_longer(cols = c(DNA_Col, RNA_Col),
names_to = "Original_Col",
values_to = "Value") %>%
group_by(ID) %>%
slice_max(Value, n = 1) %>%
ungroup() %>%
mutate(DNA_Col = ifelse(
Original_Col == "DNA_Col", Value, NA
),
RNA_Col = ifelse(
Original_Col == "RNA_Col", Value, NA
)) %>%
select(ID, strategy, Common, DNA_Col, RNA_Col)
# A tibble: 4 × 5
ID strategy Common DNA_Col RNA_Col
<chr> <chr> <dbl> <dbl> <dbl>
1 ABA DNA 0.65 0.66 NA
2 ABB RNA 0.65 NA 0.15
3 ABC DNA 0.55 0.88 NA
4 ABD RNA 0.25 NA 0.12
Upvotes: 2