user2823398
user2823398

Reputation:

R subsetting dataframe based on the combination of 3 columns and excluding duplicate combinations

I have a dataset which looks like:

Experiment  Sequence    Parameter   Time
Exp1    AAAA    2   10
Exp2    AAAA    2   11
Exp3    AAAA    2   12
Exp1    BBBB    2   13.1
Exp1    BBBB    3   13.2
Exp1    BBBB    4   13.3
Exp2    BBBB    2   14.1
Exp2    BBBB    3   14.2
Exp3    BBBB    2   16.3
Exp3    BBBB    3   16.4
Exp3    BBBB    4   16.5
Exp3    BBBB    5   16.6
Exp1    CCCC    2   20
Exp2    CCCC    2   22.2
Exp1    DDDD    3   22.3
Exp1    DDDD    2   22.4
Exp2    DDDD    3   25.2
Exp2    DDDD    2   25.3
Exp3    DDDD    3   27
Exp1    EEEE    2   28
Exp2    EEEE    3   29
Exp3    EEEE    4   30
Exp1    FFFF    2   33.2
Exp1    FFFF    3   33.4
Exp1    FFFF    4   33.6
Exp2    FFFF    2   35.1
Exp2    FFFF    3   35.2
Exp1    GGGG    2   40.1
Exp1    GGGG    2   40.2
Exp1    GGGG    2   40.3
Exp1    GGGG    2   42
Exp2    GGGG    2   42.3
Exp2    GGGG    2   44.3
Exp3    GGGG    2   45.3
Exp3    GGGG    2   45.4

The real dataset has:

For my analysis (on the time), I need at first to subset the dataframe depending on the combination of the first 3 columns: Experiment, Sequence and Parameter. Rules are:

So the dataframe in the example should become like this after subsetting:

Experiment  Sequence    Parameter   Time
Exp1    AAAA    2   10
Exp2    AAAA    2   11
Exp3    AAAA    2   12

Exp1    BBBB    2   13.1
Exp2    BBBB    2   14.1
Exp3    BBBB    2   16.3

Exp1    BBBB    3   13.2
Exp2    BBBB    3   14.2
Exp3    BBBB    3   16.4

Exp1    DDDD    3   22.3
Exp2    DDDD    3   25.2
Exp3    DDDD    3   27

Can someone help me? Thank you!

Experiment <- c("Exp1", "Exp2", "Exp3", "Exp1", "Exp1", "Exp1", "Exp2", "Exp2", "Exp3", "Exp3", "Exp3", "Exp3", "Exp1", "Exp2", "Exp1", "Exp1", "Exp2", "Exp2", "Exp3", "Exp1", "Exp2", "Exp3", "Exp1", "Exp1", "Exp1", "Exp2", "Exp2", "Exp1", "Exp1", "Exp1",   "Exp1", "Exp2", "Exp2", "Exp3", "Exp3")
Sequence <- c("AAAA", "AAAA", "AAAA", "BBBB", "BBBB", "BBBB", "BBBB", "BBBB", "BBBB", "BBBB","BBBB", "BBBB", "CCCC", "CCCC", "DDDD", "DDDD", "DDDD", "DDDD", "DDDD", "EEEE", "EEEE", "EEEE", "FFFF", "FFFF", "FFFF", "FFFF", "FFFF", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG")
Parameter <- c("2", "2", "2", "2", "3", "4", "2", "3", "2", "3", "4", "5", "2", "2", "3", "2", "3", "2", "3", "2", "3", "4", "2", "3", "4", "2", "3", "2", "2", "2", "2", "2", "2", "2", "2")
Time <- c(10.0, 11.0, 12.0, 13.1, 13.2, 13.3, 14.1, 14.2, 16.3, 16.4, 16.5, 16.6, 20.0, 22.2, 22.3, 22.4, 25.2, 25.3, 27.0, 28.0, 29.0, 30.0, 33.2, 33.4, 33.6, 35.1, 35.2, 40.1, 40.2, 40.3, 42.0, 42.3, 44.3, 45.3, 45.4)
df <- data.frame(Experiment, Sequence, Parameter, Time)

Upvotes: 2

Views: 89

Answers (3)

jogo
jogo

Reputation: 12559

this can do (my name for your dataframe is d):

e <- length(unique(d$Experiment))
a <- aggregate(Experiment ~ Sequence+Parameter, data=d, FUN=function(x) length(unique(x)))
s1 <- merge(subset(a, Experiment==e)[1:2], d)
s1$n <- ave(s1$Parameter, FUN=length, s1$Sequence, s1$Parameter, s1$Experiment)
s2 <- subset(s1, n==1)

s1 satisfies the first condition, s2 additional the second.

the output for s2 is:

> s2
   Sequence Parameter Experiment Time n
1      AAAA         2       Exp1 10.0 1
2      AAAA         2       Exp3 12.0 1
3      AAAA         2       Exp2 11.0 1
4      BBBB         2       Exp3 16.3 1
5      BBBB         2       Exp2 14.1 1
6      BBBB         2       Exp1 13.1 1
7      BBBB         3       Exp1 13.2 1
8      BBBB         3       Exp2 14.2 1
9      BBBB         3       Exp3 16.4 1
10     DDDD         3       Exp1 22.3 1
11     DDDD         3       Exp2 25.2 1
12     DDDD         3       Exp3 27.0 1

Here is the complete code:

d <- read.table(header=TRUE, text='Experiment  Sequence    Parameter   Time
Exp1    AAAA    2   10
Exp2    AAAA    2   11
Exp3    AAAA    2   12
Exp1    BBBB    2   13.1
Exp1    BBBB    3   13.2
Exp1    BBBB    4   13.3
Exp2    BBBB    2   14.1
Exp2    BBBB    3   14.2
Exp3    BBBB    2   16.3
Exp3    BBBB    3   16.4
Exp3    BBBB    4   16.5
Exp3    BBBB    5   16.6
Exp1    CCCC    2   20
Exp2    CCCC    2   22.2
Exp1    DDDD    3   22.3
Exp1    DDDD    2   22.4
Exp2    DDDD    3   25.2
Exp2    DDDD    2   25.3
Exp3    DDDD    3   27
Exp1    EEEE    2   28
Exp2    EEEE    3   29
Exp3    EEEE    4   30
Exp1    FFFF    2   33.2
Exp1    FFFF    3   33.4
Exp1    FFFF    4   33.6
Exp2    FFFF    2   35.1
Exp2    FFFF    3   35.2
Exp1    GGGG    2   40.1
Exp1    GGGG    2   40.2
Exp1    GGGG    2   40.3
Exp1    GGGG    2   42
Exp2    GGGG    2   42.3
Exp2    GGGG    2   44.3
Exp3    GGGG    2   45.3
Exp3    GGGG    2   45.4')
e <- length(unique(d$Experiment))
a <- aggregate(Experiment ~ Sequence+Parameter, data=d, FUN=function(x) length(unique(x)))
s1 <- merge(subset(a, Experiment==e)[1:2], d)
s1$n <- ave(s1$Parameter, FUN=length, s1$Sequence, s1$Parameter, s1$Experiment)
s2 <- subset(s1, n==1)

The version for df

df <- data.frame(
Experiment= c("Exp1", "Exp2", "Exp3", "Exp1", "Exp1", "Exp1", "Exp2", "Exp2", "Exp3", "Exp3", "Exp3", "Exp3", "Exp1", "Exp2", "Exp1", "Exp1", "Exp2", "Exp2", "Exp3", "Exp1", "Exp2", "Exp3", "Exp1", "Exp1", "Exp1", "Exp2", "Exp2", "Exp1", "Exp1", "Exp1",   "Exp1", "Exp2", "Exp2", "Exp3", "Exp3"),
Sequence= c("AAAA", "AAAA", "AAAA", "BBBB", "BBBB", "BBBB", "BBBB", "BBBB", "BBBB", "BBBB","BBBB", "BBBB", "CCCC", "CCCC", "DDDD", "DDDD", "DDDD", "DDDD", "DDDD", "EEEE", "EEEE", "EEEE", "FFFF", "FFFF", "FFFF", "FFFF", "FFFF", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG"),
Parameter= c("2", "2", "2", "2", "3", "4", "2", "3", "2", "3", "4", "5", "2", "2", "3", "2", "3", "2", "3", "2", "3", "4", "2", "3", "4", "2", "3", "2", "2", "2", "2", "2", "2", "2", "2"),
Time= c("10.0", "11.0", "12.0", "13.1", "13.2", "13.3", "14.1", "14.2", "16.3", "16.4", "16.5", "16.6", "20.0", "22.2", "22.3", "22.4", "25.2", "25.3", "27.0", "28.0","29.0", "30.0", "33.2", "33.4", "33.6", "35.1", "35.2", "40.1", "40.2", "40.3","42.0", "42.3", "44.3", "45.3", "45.4")
)
df$eins <- 1
e <- length(unique(df$Experiment))
a <- aggregate(Experiment ~ Sequence+Parameter, data=df, FUN=function(x) length(unique(x)))
s1 <- merge(subset(a, Experiment==e)[1:2], df)
s1$n <- ave(s1$eins, FUN=length, s1$Sequence, s1$Parameter, s1$Experiment)
s2 <- subset(s1, n==1)

I suppose df$Time should not be a factor.

Upvotes: 2

akrun
akrun

Reputation: 886948

One option is data.table. We convert the 'data.frame' to 'data.table' (setDT(df), grouped by 'Sequence', 'Parameter', if the frequency of unique elements in 'Experiment' is 3, we Subset the Data.table (.SD), then, we group by 'Experiment', 'Sequence', and 'Parameter' if the nrow is equal to 1 (.N==1) we Subset the Data.table (.SD).

library(data.table)
setDT(df)[, if(uniqueN(Experiment)==3) .SD, by = .(Sequence, Parameter)
        ][,if(.N ==1) .SD , by = .(Experiment,Sequence, Parameter)]
#      Experiment Sequence Parameter Time
# 1:       Exp1     AAAA         2 10.0
# 2:       Exp2     AAAA         2 11.0
# 3:       Exp3     AAAA         2 12.0
# 4:       Exp1     BBBB         2 13.1
# 5:       Exp2     BBBB         2 14.1
# 6:       Exp3     BBBB         2 16.3
# 7:       Exp1     BBBB         3 13.2
# 8:       Exp2     BBBB         3 14.2
# 9:       Exp3     BBBB         3 16.4
#10:       Exp1     DDDD         3 22.3
#11:       Exp2     DDDD         3 25.2
#12:       Exp3     DDDD         3 27.0

Upvotes: 3

Ricky
Ricky

Reputation: 4686

require(dplyr)

# convenience variable aggregate of the two variables
df$seq.param <- with(df, paste(Sequence, Parameter))

# count number of unique experiments for each Sequence+Parameter
crosstab <- with(df, table(seq.param, Experiment))
seq.param.count <- rowSums(crosstab)

# identify those which appears for all experiment
selected <- names(seq.param.count[seq.param.count==length(unique(df$Experiment))])

dfs <- df %>%
  filter(seq.param %in% selected) %>%   # keep just all those appear in all
  group_by(Experiment, Sequence, Parameter) %>%
  summarize(count=n(), Time=Time[1]) %>%  # count how many times in each experiment
  filter(count==1) %>%   # keep just those that appears once
  select(-count)   # remove extra variable

The output (displayed in the sorting you showed in your example)

> dfs[order(dfs$Sequence, dfs$Parameter),]
Source: local data frame [12 x 5]
Groups: Experiment, Sequence [9]

   Experiment Sequence Parameter count   Time
       (fctr)   (fctr)    (fctr) (int) (fctr)
1        Exp1     AAAA         2     1   10.0
2        Exp2     AAAA         2     1   11.0
3        Exp3     AAAA         2     1   12.0
4        Exp1     BBBB         2     1   13.1
5        Exp2     BBBB         2     1   14.1
6        Exp3     BBBB         2     1   16.3
7        Exp1     BBBB         3     1   13.2
8        Exp2     BBBB         3     1   14.2
9        Exp3     BBBB         3     1   16.4
10       Exp1     DDDD         3     1   22.3
11       Exp2     DDDD         3     1   25.2
12       Exp3     DDDD         3     1   27.0

Upvotes: 2

Related Questions