Reputation:
I have a dataset which looks like:
Experiment Sequence Parameter Time
Exp1 AAAA 2 10
Exp2 AAAA 2 11
Exp3 AAAA 2 12
Exp1 BBBB 2 13.1
Exp1 BBBB 3 13.2
Exp1 BBBB 4 13.3
Exp2 BBBB 2 14.1
Exp2 BBBB 3 14.2
Exp3 BBBB 2 16.3
Exp3 BBBB 3 16.4
Exp3 BBBB 4 16.5
Exp3 BBBB 5 16.6
Exp1 CCCC 2 20
Exp2 CCCC 2 22.2
Exp1 DDDD 3 22.3
Exp1 DDDD 2 22.4
Exp2 DDDD 3 25.2
Exp2 DDDD 2 25.3
Exp3 DDDD 3 27
Exp1 EEEE 2 28
Exp2 EEEE 3 29
Exp3 EEEE 4 30
Exp1 FFFF 2 33.2
Exp1 FFFF 3 33.4
Exp1 FFFF 4 33.6
Exp2 FFFF 2 35.1
Exp2 FFFF 3 35.2
Exp1 GGGG 2 40.1
Exp1 GGGG 2 40.2
Exp1 GGGG 2 40.3
Exp1 GGGG 2 42
Exp2 GGGG 2 42.3
Exp2 GGGG 2 44.3
Exp3 GGGG 2 45.3
Exp3 GGGG 2 45.4
The real dataset has:
For my analysis (on the time), I need at first to subset the dataframe depending on the combination of the first 3 columns: Experiment, Sequence and Parameter. Rules are:
So the dataframe in the example should become like this after subsetting:
Experiment Sequence Parameter Time
Exp1 AAAA 2 10
Exp2 AAAA 2 11
Exp3 AAAA 2 12
Exp1 BBBB 2 13.1
Exp2 BBBB 2 14.1
Exp3 BBBB 2 16.3
Exp1 BBBB 3 13.2
Exp2 BBBB 3 14.2
Exp3 BBBB 3 16.4
Exp1 DDDD 3 22.3
Exp2 DDDD 3 25.2
Exp3 DDDD 3 27
Can someone help me? Thank you!
Experiment <- c("Exp1", "Exp2", "Exp3", "Exp1", "Exp1", "Exp1", "Exp2", "Exp2", "Exp3", "Exp3", "Exp3", "Exp3", "Exp1", "Exp2", "Exp1", "Exp1", "Exp2", "Exp2", "Exp3", "Exp1", "Exp2", "Exp3", "Exp1", "Exp1", "Exp1", "Exp2", "Exp2", "Exp1", "Exp1", "Exp1", "Exp1", "Exp2", "Exp2", "Exp3", "Exp3")
Sequence <- c("AAAA", "AAAA", "AAAA", "BBBB", "BBBB", "BBBB", "BBBB", "BBBB", "BBBB", "BBBB","BBBB", "BBBB", "CCCC", "CCCC", "DDDD", "DDDD", "DDDD", "DDDD", "DDDD", "EEEE", "EEEE", "EEEE", "FFFF", "FFFF", "FFFF", "FFFF", "FFFF", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG")
Parameter <- c("2", "2", "2", "2", "3", "4", "2", "3", "2", "3", "4", "5", "2", "2", "3", "2", "3", "2", "3", "2", "3", "4", "2", "3", "4", "2", "3", "2", "2", "2", "2", "2", "2", "2", "2")
Time <- c(10.0, 11.0, 12.0, 13.1, 13.2, 13.3, 14.1, 14.2, 16.3, 16.4, 16.5, 16.6, 20.0, 22.2, 22.3, 22.4, 25.2, 25.3, 27.0, 28.0, 29.0, 30.0, 33.2, 33.4, 33.6, 35.1, 35.2, 40.1, 40.2, 40.3, 42.0, 42.3, 44.3, 45.3, 45.4)
df <- data.frame(Experiment, Sequence, Parameter, Time)
Upvotes: 2
Views: 89
Reputation: 12559
this can do (my name for your dataframe is d
):
e <- length(unique(d$Experiment))
a <- aggregate(Experiment ~ Sequence+Parameter, data=d, FUN=function(x) length(unique(x)))
s1 <- merge(subset(a, Experiment==e)[1:2], d)
s1$n <- ave(s1$Parameter, FUN=length, s1$Sequence, s1$Parameter, s1$Experiment)
s2 <- subset(s1, n==1)
s1
satisfies the first condition, s2
additional the second.
the output for s2
is:
> s2
Sequence Parameter Experiment Time n
1 AAAA 2 Exp1 10.0 1
2 AAAA 2 Exp3 12.0 1
3 AAAA 2 Exp2 11.0 1
4 BBBB 2 Exp3 16.3 1
5 BBBB 2 Exp2 14.1 1
6 BBBB 2 Exp1 13.1 1
7 BBBB 3 Exp1 13.2 1
8 BBBB 3 Exp2 14.2 1
9 BBBB 3 Exp3 16.4 1
10 DDDD 3 Exp1 22.3 1
11 DDDD 3 Exp2 25.2 1
12 DDDD 3 Exp3 27.0 1
Here is the complete code:
d <- read.table(header=TRUE, text='Experiment Sequence Parameter Time
Exp1 AAAA 2 10
Exp2 AAAA 2 11
Exp3 AAAA 2 12
Exp1 BBBB 2 13.1
Exp1 BBBB 3 13.2
Exp1 BBBB 4 13.3
Exp2 BBBB 2 14.1
Exp2 BBBB 3 14.2
Exp3 BBBB 2 16.3
Exp3 BBBB 3 16.4
Exp3 BBBB 4 16.5
Exp3 BBBB 5 16.6
Exp1 CCCC 2 20
Exp2 CCCC 2 22.2
Exp1 DDDD 3 22.3
Exp1 DDDD 2 22.4
Exp2 DDDD 3 25.2
Exp2 DDDD 2 25.3
Exp3 DDDD 3 27
Exp1 EEEE 2 28
Exp2 EEEE 3 29
Exp3 EEEE 4 30
Exp1 FFFF 2 33.2
Exp1 FFFF 3 33.4
Exp1 FFFF 4 33.6
Exp2 FFFF 2 35.1
Exp2 FFFF 3 35.2
Exp1 GGGG 2 40.1
Exp1 GGGG 2 40.2
Exp1 GGGG 2 40.3
Exp1 GGGG 2 42
Exp2 GGGG 2 42.3
Exp2 GGGG 2 44.3
Exp3 GGGG 2 45.3
Exp3 GGGG 2 45.4')
e <- length(unique(d$Experiment))
a <- aggregate(Experiment ~ Sequence+Parameter, data=d, FUN=function(x) length(unique(x)))
s1 <- merge(subset(a, Experiment==e)[1:2], d)
s1$n <- ave(s1$Parameter, FUN=length, s1$Sequence, s1$Parameter, s1$Experiment)
s2 <- subset(s1, n==1)
The version for df
df <- data.frame(
Experiment= c("Exp1", "Exp2", "Exp3", "Exp1", "Exp1", "Exp1", "Exp2", "Exp2", "Exp3", "Exp3", "Exp3", "Exp3", "Exp1", "Exp2", "Exp1", "Exp1", "Exp2", "Exp2", "Exp3", "Exp1", "Exp2", "Exp3", "Exp1", "Exp1", "Exp1", "Exp2", "Exp2", "Exp1", "Exp1", "Exp1", "Exp1", "Exp2", "Exp2", "Exp3", "Exp3"),
Sequence= c("AAAA", "AAAA", "AAAA", "BBBB", "BBBB", "BBBB", "BBBB", "BBBB", "BBBB", "BBBB","BBBB", "BBBB", "CCCC", "CCCC", "DDDD", "DDDD", "DDDD", "DDDD", "DDDD", "EEEE", "EEEE", "EEEE", "FFFF", "FFFF", "FFFF", "FFFF", "FFFF", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG"),
Parameter= c("2", "2", "2", "2", "3", "4", "2", "3", "2", "3", "4", "5", "2", "2", "3", "2", "3", "2", "3", "2", "3", "4", "2", "3", "4", "2", "3", "2", "2", "2", "2", "2", "2", "2", "2"),
Time= c("10.0", "11.0", "12.0", "13.1", "13.2", "13.3", "14.1", "14.2", "16.3", "16.4", "16.5", "16.6", "20.0", "22.2", "22.3", "22.4", "25.2", "25.3", "27.0", "28.0","29.0", "30.0", "33.2", "33.4", "33.6", "35.1", "35.2", "40.1", "40.2", "40.3","42.0", "42.3", "44.3", "45.3", "45.4")
)
df$eins <- 1
e <- length(unique(df$Experiment))
a <- aggregate(Experiment ~ Sequence+Parameter, data=df, FUN=function(x) length(unique(x)))
s1 <- merge(subset(a, Experiment==e)[1:2], df)
s1$n <- ave(s1$eins, FUN=length, s1$Sequence, s1$Parameter, s1$Experiment)
s2 <- subset(s1, n==1)
I suppose df$Time
should not be a factor.
Upvotes: 2
Reputation: 886948
One option is data.table
. We convert the 'data.frame' to 'data.table' (setDT(df)
, grouped by 'Sequence', 'Parameter', if
the frequency of unique
elements in 'Experiment' is 3, we Subset the Data.table (.SD
), then, we group by 'Experiment', 'Sequence', and 'Parameter' if the nrow is equal to 1 (.N==1
) we Subset the Data.table (.SD
).
library(data.table)
setDT(df)[, if(uniqueN(Experiment)==3) .SD, by = .(Sequence, Parameter)
][,if(.N ==1) .SD , by = .(Experiment,Sequence, Parameter)]
# Experiment Sequence Parameter Time
# 1: Exp1 AAAA 2 10.0
# 2: Exp2 AAAA 2 11.0
# 3: Exp3 AAAA 2 12.0
# 4: Exp1 BBBB 2 13.1
# 5: Exp2 BBBB 2 14.1
# 6: Exp3 BBBB 2 16.3
# 7: Exp1 BBBB 3 13.2
# 8: Exp2 BBBB 3 14.2
# 9: Exp3 BBBB 3 16.4
#10: Exp1 DDDD 3 22.3
#11: Exp2 DDDD 3 25.2
#12: Exp3 DDDD 3 27.0
Upvotes: 3
Reputation: 4686
require(dplyr)
# convenience variable aggregate of the two variables
df$seq.param <- with(df, paste(Sequence, Parameter))
# count number of unique experiments for each Sequence+Parameter
crosstab <- with(df, table(seq.param, Experiment))
seq.param.count <- rowSums(crosstab)
# identify those which appears for all experiment
selected <- names(seq.param.count[seq.param.count==length(unique(df$Experiment))])
dfs <- df %>%
filter(seq.param %in% selected) %>% # keep just all those appear in all
group_by(Experiment, Sequence, Parameter) %>%
summarize(count=n(), Time=Time[1]) %>% # count how many times in each experiment
filter(count==1) %>% # keep just those that appears once
select(-count) # remove extra variable
The output (displayed in the sorting you showed in your example)
> dfs[order(dfs$Sequence, dfs$Parameter),]
Source: local data frame [12 x 5]
Groups: Experiment, Sequence [9]
Experiment Sequence Parameter count Time
(fctr) (fctr) (fctr) (int) (fctr)
1 Exp1 AAAA 2 1 10.0
2 Exp2 AAAA 2 1 11.0
3 Exp3 AAAA 2 1 12.0
4 Exp1 BBBB 2 1 13.1
5 Exp2 BBBB 2 1 14.1
6 Exp3 BBBB 2 1 16.3
7 Exp1 BBBB 3 1 13.2
8 Exp2 BBBB 3 1 14.2
9 Exp3 BBBB 3 1 16.4
10 Exp1 DDDD 3 1 22.3
11 Exp2 DDDD 3 1 25.2
12 Exp3 DDDD 3 1 27.0
Upvotes: 2