Reputation: 879
here is a dataframe:
cluster_names Species values Nsp Nsp_MRCA Event NB_Event Nsp_losses
1 Group1 Sp1 1 3 3 1 2 0
2 Group1 Sp1 4 3 3 1 2 0
3 Group1 Sp2 78 NA NA 1 2 NA
4 Group1 Sp3 NA 3 12 2 2 9
5 Group1 Sp4 NA 3 3 2 2 0
6 Group2 Sp2 3 2 3 2 2 1
7 Group2 Sp3 9 2 40 2 2 38
8 Group2 Sp4 8 NA NA 2 2 NA
9 Group3 Sp1 9 2 2 1 1 0
10 Group3 Sp3 10 3 3 1 1 0
11 Group3 Sp3 12 3 20 1 1 17
12 Group3 Sp3 14 2 3 1 1 1
13 Group4 Sp4 23 3 112 1 1 109
14 Group5 Sp3 34 5 114 1 1 109
15 Group6 Sp4 2 3 3 1 1 0
How can I tell dplyr to keep only the groups (cluster_names) where:
- Nsp > 1 for at least one row
- Nsp == Nsp_MRCA for at least one row
- Nsp_losses < 3, except if all the Nsp are between 2 and 5 and all the Nsp_losses < 20
- NB_Event is < 3
Here with such filter I should get a new df :
cluster_names Species values Nsp Nsp_MRCA Event NB_Event Nsp_losses
1 Group1 Sp1 1 3 3 1 2 0
2 Group1 Sp1 4 3 3 1 2 0
3 Group1 Sp2 78 NA NA 1 2 NA
4 Group1 Sp3 NA 3 12 2 2 9
5 Group1 Sp4 NA 3 3 2 2 0
9 Group3 Sp1 9 2 2 1 1 0
10 Group3 Sp3 10 3 3 1 1 0
11 Group3 Sp3 12 3 20 1 1 17
12 Group3 Sp3 14 2 3 1 1 1
15 Group6 Sp4 2 3 3 1 1 0
Detail:
Group1 is kept because all the Nsp are between 2 and 5 and all the Nsp_losses < 20.
Group2 is removed because Nsp_losses = 38.
Group3 is kept because all the Nsp are between 2 and 5 and all the Nsp_losses < 20.
Groups 4 and 5 are removed because Nsp_losses = 109.
Group6 is kept because Nsp == Nsp_MRCA for at least one row.
All of the kept groups also have at least one row with Nsp > 1.
So far I tried the following code:
# Asker's attempt: filter whole clusters of `tab` using group-level conditions.
tab %>%
group_by(cluster_names) %>%
# NB_Event: largest Event value within each cluster (NAs ignored).
mutate(NB_Event = max(Event,na.rm=TRUE)) %>%
# Keep clusters in which at least one row has Nsp > 1 or a missing Nsp.
filter(any(Nsp > 1 |is.na(Nsp))) %>%
# Keep clusters in which at least one row has Nsp equal to Nsp_MRCA.
# NOTE(review): no na.rm here, so any() can return NA for clusters containing
# NA rows and no match; filter() treats NA as "drop".
filter(any(Nsp == Nsp_MRCA)) %>%
# Nsp_losses: absolute difference between Nsp and Nsp_MRCA.
mutate(Nsp_losses = abs(Nsp - Nsp_MRCA)) %>%
# NOTE(review): the third all() is nested inside the second one, and `&` binds
# tighter than `|`, so the second test reads
# `Nsp > 1 | (is.na(Nsp) & all(Nsp_losses < 20 | is.na(Nsp_losses)))`;
# the Nsp_losses < 20 check therefore only matters for rows with a missing Nsp.
# No `Nsp_losses < 3` or `NB_Event < 3` condition is applied in this chain, and
# the chain ends on a dangling pipe.
filter(all(Nsp <=5 |is.na(Nsp)) & all(Nsp > 1 |is.na(Nsp) & all(Nsp_losses < 20 |is.na(Nsp_losses)))) %>%
Here is the dataframe
# dput() of the example data frame: columns cluster_names, Species, values,
# Nsp, Nsp_MRCA and Event (15 rows). NB_Event and Nsp_losses are not stored
# here. The expression is not assigned to a name in this snippet.
structure(list(cluster_names = structure(c(1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 5L, 6L), .Label = c("Group1",
"Group2", "Group3", "Group4", "Group5", "Group6"), class = "factor"),
Species = structure(c(1L, 1L, 2L, 3L, 4L, 2L, 3L, 4L, 1L,
3L, 3L, 3L, 4L, 3L, 4L), .Label = c("Sp1", "Sp2", "Sp3",
"Sp4"), class = "factor"), values = c(1L, 4L, 78L, NA, NA,
3L, 9L, 8L, 9L, 10L, 12L, 14L, 23L, 34L, 2L), Nsp = c(3L,
3L, NA, 3L, 3L, 2L, 2L, NA, 2L, 3L, 3L, 2L, 3L, 5L, 3L),
Nsp_MRCA = c(3L, 3L, NA, 12L, 3L, 3L, 40L, NA, 2L, 3L, 20L,
3L, 112L, 114L, 3L), Event = c(1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), class = "data.frame", row.names = c(NA,
-15L))
Thank you for your help and time.
Upvotes: 1
Views: 393
Reputation: 13309
We could do:
# Keep whole clusters (never individual rows) that satisfy the question's
# rules. Nsp_losses is derived as |Nsp - Nsp_MRCA| and kept in the result.
tab %>%
  group_by(cluster_names) %>%
  mutate(Nsp_losses = abs(Nsp - Nsp_MRCA)) %>%
  # Each condition below collapses to a single TRUE/FALSE per cluster, so
  # filter() keeps or drops all rows of a cluster together. na.rm = TRUE makes
  # rows with missing values neutral instead of turning the result into NA.
  filter(
    # At least one row with Nsp > 1.
    any(Nsp > 1, na.rm = TRUE),
    # At least one row where Nsp equals Nsp_MRCA.
    any(Nsp == Nsp_MRCA, na.rm = TRUE),
    # The largest Event in the cluster (the question's NB_Event) is below 3.
    all(Event < 3, na.rm = TRUE),
    # Losses must stay below 3, unless every Nsp lies in [2, 5] and every
    # loss is below 20.
    all(Nsp_losses < 3, na.rm = TRUE) |
      (all(between(Nsp, 2, 5), na.rm = TRUE) &
        all(Nsp_losses < 20, na.rm = TRUE))
  ) %>%
  # Drop the grouping so later verbs operate on the full table again.
  ungroup()
cluster_names Species values Nsp Nsp_MRCA Event Nsp_losses
<fct> <fct> <int> <int> <int> <int> <int>
1 Group1 Sp1 1 3 3 1 0
2 Group1 Sp1 4 3 3 1 0
3 Group1 Sp2 78 NA NA 1 NA
4 Group1 Sp3 NA 3 12 2 9
5 Group1 Sp4 NA 3 3 2 0
6 Group3 Sp1 9 2 2 1 0
7 Group3 Sp3 10 3 3 1 0
8 Group3 Sp3 12 3 20 1 17
9 Group3 Sp3 14 2 3 1 1
10 Group6 Sp4 2 3 3 1 0
Upvotes: 2
Reputation: 4520
Assuming you already have NB_Event
and Nsp_losses
vars, and recreating your text line-by-line:
library(tidyverse)
# Keep every row of the clusters that pass the group-level rules.
# Expects `dat` to already contain the NB_Event and Nsp_losses columns.
dat %>%
  group_by(cluster_names) %>%
  filter(
    # At least one row with Nsp > 1.
    any(Nsp > 1, na.rm = TRUE),
    # At least one row where Nsp equals Nsp_MRCA.
    any(Nsp == Nsp_MRCA, na.rm = TRUE),
    # NB_Event must be below 3 for the cluster.
    all(NB_Event < 3, na.rm = TRUE),
    # Only the Nsp_losses rule has an exception, so the `|` is confined to
    # this condition: losses < 3, or (all Nsp in [2, 5] and all losses < 20).
    # Without the explicit grouping, `a & b & c & d | e` parses as
    # `(a & b & c & d) | e` and the exception would bypass the rules above.
    all(Nsp_losses < 3, na.rm = TRUE) |
      (all(between(Nsp, 2, 5), na.rm = TRUE) &
        all(Nsp_losses < 20, na.rm = TRUE))
  ) %>%
  ungroup()
Which outputs:
# A tibble: 10 x 8
cluster_names Species values Nsp Nsp_MRCA Event NB_Event Nsp_losses
<fct> <fct> <int> <int> <int> <int> <dbl> <int>
1 Group1 Sp1 1 3 3 1 2 0
2 Group1 Sp1 4 3 3 1 2 0
3 Group1 Sp2 78 NA NA 1 2 NA
4 Group1 Sp3 NA 3 12 2 2 9
5 Group1 Sp4 NA 3 3 2 2 0
6 Group3 Sp1 9 2 2 1 2 0
7 Group3 Sp3 10 3 3 1 2 0
8 Group3 Sp3 12 3 20 1 2 17
9 Group3 Sp3 14 2 3 1 2 1
10 Group6 Sp4 2 3 3 1 2 0
Upvotes: 1
Reputation: 388807
The presence of NAs makes it a bit tricky; hence, I remove them first using na.omit, find the groups (cluster_names) which satisfy the given conditions, and then filter the original data based on that.
library(dplyr)
# Keep every row of `tab` whose cluster passes the group-level rules.
# The qualifying cluster names are computed on an NA-free copy and then used
# to filter the untouched original, so rows containing NA are still returned.
tab %>%
  filter(cluster_names %in% (
    tab %>%
      # Restrict na.omit() to the columns the rules actually use; otherwise a
      # row with an NA in an unrelated column (e.g. `values`) would be dropped
      # and its Nsp_losses silently ignored by the checks below.
      select(cluster_names, Nsp, Nsp_MRCA, Event) %>%
      na.omit() %>%
      mutate(Nsp_losses = abs(Nsp - Nsp_MRCA)) %>%
      group_by(cluster_names) %>%
      filter(
        # The two "at least one row" rules are independent of each other.
        any(Nsp > 1),
        any(Nsp == Nsp_MRCA),
        all(Event < 3),
        # Relaxed loss threshold only when every Nsp lies in 2..5.
        if (all(Nsp %in% 2:5)) all(Nsp_losses < 20) else all(Nsp_losses < 3)
      ) %>%
      pull(cluster_names) %>%
      unique()
  ))
# cluster_names Species values Nsp Nsp_MRCA Event
#1 Group1 Sp1 1 3 3 1
#2 Group1 Sp1 4 3 3 1
#3 Group1 Sp2 78 NA NA 1
#4 Group1 Sp3 NA 3 12 2
#5 Group1 Sp4 NA 3 3 2
#6 Group3 Sp1 9 2 2 1
#7 Group3 Sp3 10 3 3 1
#8 Group3 Sp3 12 3 20 1
#9 Group3 Sp3 14 2 3 1
#10 Group6 Sp4 2 3 3 1
Upvotes: 2