Reputation: 3207
Following up on this question, I am trying to add one more layer of difficulty.
I have a data.frame
that looks like this:
> set.seed(123)
> mydf <- data.frame(Marker=rep(c('M1','M2'),each=15),
+ Patient=rep(rep(c('P1','P2','P3'),each=5),2),
+ Value=sample(1:1000, 30, replace = F))
> mydf
Marker Patient Value
1 M1 P1 288
2 M1 P1 788
3 M1 P1 409
4 M1 P1 881
5 M1 P1 937
6 M1 P2 46
7 M1 P2 525
8 M1 P2 887
9 M1 P2 548
10 M1 P2 453
11 M1 P3 948
12 M1 P3 449
13 M1 P3 670
14 M1 P3 566
15 M1 P3 102
16 M2 P1 993
17 M2 P1 243
18 M2 P1 42
19 M2 P1 323
20 M2 P1 996
21 M2 P2 872
22 M2 P2 679
23 M2 P2 627
24 M2 P2 972
25 M2 P2 640
26 M2 P3 691
27 M2 P3 530
28 M2 P3 579
29 M2 P3 282
30 M2 P3 143
What I want to do, is to run t.test
for each Patient combination (my grouping variable), on a Marker basis (my ID variable).
Based on one answer to the related question above, I know how to do it for one Marker at a time.
I can subset mydf
and do the following:
> params_list <- utils::combn(levels(mydf$Patient), 2, FUN = list)
> mydf0 <- subset(mydf, Marker=="M1")
> model_t <- purrr::map(.x = params_list,
+ .f = ~ t.test(formula = Value ~ Patient,
+ data = subset(mydf0, Patient %in% .x)))
> t_pvals <- purrr::map_dbl(.x = model_t, .f = "p.value")
> names(t_pvals) <- purrr::map_chr(.x = params_list, .f = ~ paste0(.x, collapse = "-vs-"))
> t_pvals
P1-vs-P2 P1-vs-P3 P2-vs-P3
0.3945742 0.5678729 0.7820905
Now I want to do it for all Markers in mydf
in an elegant way, and I chose data.table
.
I try the following, but I cannot reproduce the above pvalue results for Marker M1.
> group1 <- unlist(lapply(params_list, '[', 1))
> group2 <- unlist(lapply(params_list, '[', 2))
> mydt <- data.table::data.table(mydf)
> results_df <- as.data.frame(mydt[, list(group1= unlist(lapply(params_list, '[', 1)),
+ group2= unlist(lapply(params_list, '[', 2)),
+ pvalue= purrr::map_dbl(.x = purrr::map(.x = params_list,
+ .f = ~ stats::t.test(formula = Value ~ Patient, paired=FALSE,
+ data = subset(mydt, Patient %in% .x))), .f = "p.value") ),
+ by=list(Marker=Marker)])
> results_df
Marker group1 group2 pvalue
1 M1 P1 P2 0.8092365
2 M1 P1 P3 0.5156313
3 M1 P2 P3 0.2879954
4 M2 P1 P2 0.8092365
5 M2 P1 P3 0.5156313
6 M2 P2 P3 0.2879954
The structure of results_df
is exactly as I want it, but the pvalues are clearly wrong. They are not the same as the ones in the test above for M1, and they are identical for M1 and M2, meaning the same data subset is used in both cases.
I figured I should subset for each Marker as well in the subset
command, so I did this instead:
> markers_list <- as.list(levels(mydf$Marker))
> mydt <- data.table::data.table(mydf)
> results_df <- as.data.frame(mydt[, list(group1= unlist(lapply(params_list, '[', 1)),
+ group2= unlist(lapply(params_list, '[', 2)),
+ pvalue= purrr::map_dbl(.x = purrr::map(.x = params_list, .y = markers_list,
+ .f = ~ stats::t.test(formula = Value ~ Patient, paired=FALSE,
+ data = subset(mydt, Patient %in% .x & Marker==.y))), .f = "p.value") ),
+ by=list(Marker=Marker)])
> results_df
Marker group1 group2 pvalue
1 M1 P1 P2 0.7337355
2 M1 P1 P3 0.6930669
3 M1 P2 P3 0.3788015
4 M2 P1 P2 0.7337355
5 M2 P1 P3 0.6930669
6 M2 P2 P3 0.3788015
I thought this would be it, but still I'm getting incorrect pvalues, and identical for both M1 and M2 (same data subset still being used for both)...
So now I'm clueless... What am I doing wrong here? What would be the way to do it?
Thanks!
Upvotes: 2
Views: 244
Reputation: 34501
Using pairwise.t.test()
on the data grouped by Marker
seems a better way to approach this and avoids the need to explicitly generate the Patient
combinations.
library(dplyr)
library(tidyr)
mydf %>%
group_by(Marker) %>%
summarise(x = list(pairwise.t.test(Value, Patient, p.adjust.method = "none", pool.sd = FALSE)$p.value %>% as.data.frame.table(responseName = "p.value"))) %>%
unnest(x) %>%
filter(!is.na(p.value))
# A tibble: 6 x 4
Marker Var1 Var2 p.value
<fct> <fct> <fct> <dbl>
1 M1 P2 P1 0.395
2 M1 P3 P1 0.568
3 M1 P3 P2 0.782
4 M2 P2 P1 0.310
5 M2 P3 P1 0.751
6 M2 P3 P2 0.0373
In response to your comment, there is also a pairwise version of the wilcox test:
mydf %>%
group_by(Marker) %>%
summarise(x = list(pairwise.wilcox.test(Value, Patient, p.adjust.method = "none")$p.value %>% as.data.frame.table(responseName = "p.value"))) %>%
unnest(x) %>%
filter(!is.na(p.value))
# A tibble: 6 x 4
Marker Var1 Var2 p.value
<fct> <fct> <fct> <dbl>
1 M1 P2 P1 0.690
2 M1 P3 P1 0.841
3 M1 P3 P2 0.690
4 M2 P2 P1 0.690
5 M2 P3 P1 1
6 M2 P3 P2 0.0556
Upvotes: 1
Reputation: 389012
Here's one tidyverse
approach :
library(tidyverse)
get_p_value <- function(df) {
map_df(params_list, ~{
tibble(Marker = df[[1]][1], group1 = .x[1], group2 = .x[2],
pvalue = t.test(df$Value[df$Patient == .x[1]],
df$Value[df$Patient == .x[2]])$p.value)
})
}
mydf %>% group_split(Marker) %>% map_df(get_p_value)
# A tibble: 6 x 4
# Marker group1 group2 pvalue
# <fct> <chr> <chr> <dbl>
#1 M1 P1 P2 0.395
#2 M1 P1 P3 0.568
#3 M1 P2 P3 0.782
#4 M2 P1 P2 0.310
#5 M2 P1 P3 0.751
#6 M2 P2 P3 0.0373
where params_list
is from OP.
params_list <- utils::combn(levels(mydf$Patient), 2, FUN = list)
Upvotes: 1
Reputation: 25225
Another option in data.table
:
mydf[, rbindlist(combn(split(Value, Patient), 2L,
function(x) c(as.list(names(x)), .(t.test(x[[1]], x[[2]])$p.value)), simplify=FALSE))
, Marker]
output:
Marker V1 V2 V3
1: M1 P1 P2 0.3945742
2: M1 P1 P3 0.5678729
3: M1 P2 P3 0.7820905
4: M2 P1 P2 0.3098955
5: M2 P1 P3 0.7505371
6: M2 P2 P3 0.0372944
data:
library(data.table)
mydf <- fread("
Marker Patient Value
M1 P1 288
M1 P1 788
M1 P1 409
M1 P1 881
M1 P1 937
M1 P2 46
M1 P2 525
M1 P2 887
M1 P2 548
M1 P2 453
M1 P3 948
M1 P3 449
M1 P3 670
M1 P3 566
M1 P3 102
M2 P1 993
M2 P1 243
M2 P1 42
M2 P1 323
M2 P1 996
M2 P2 872
M2 P2 679
M2 P2 627
M2 P2 972
M2 P2 640
M2 P3 691
M2 P3 530
M2 P3 579
M2 P3 282
M2 P3 143")
Upvotes: 1
Reputation: 27732
Here is a data.table
-solution
I could not reproduce your sample data, so I read in the values provided using data.table::fread()
.
You could also use data.table::setDT(mydf)
on your existing mydf
to convert it to a data.table.
sample data
library(data.table)
#setDT(mydf)
mydf <- fread(" Marker Patient Value
M1 P1 288
M1 P1 788
M1 P1 409
M1 P1 881
M1 P1 937
M1 P2 46
M1 P2 525
M1 P2 887
M1 P2 548
M1 P2 453
M1 P3 948
M1 P3 449
M1 P3 670
M1 P3 566
M1 P3 102
M2 P1 993
M2 P1 243
M2 P1 42
M2 P1 323
M2 P1 996
M2 P2 872
M2 P2 679
M2 P2 627
M2 P2 972
M2 P2 640
M2 P3 691
M2 P3 530
M2 P3 579
M2 P3 282
M2 P3 143")
code
I added some brief explanation, and intermediate/temporary results in comments in the code. But it has become more comment than code ;-)...
Anyway, here we go...
mydf[,
#suppress immediate output using {}
{
# find all unique combinations of 2 patients (by Marker, see last line)
# For Marker == "M1", this looks like:
# V1 V2
# 1: P1 P2
# 2: P1 P3
# 3: P2 P3
patientcomb <- data.table( t( combn( unique( Patient ), 2 ) ) )
#set column names for V1 and V2 of patientcomb, for better readable code
names( patientcomb ) <- c( "group1", "group2" )
#now, using the temporarily created patientcomb-data.table...
patientcomb[,
#... perform the t.test(), using the Values from mydf,
# where the patients match group1/group1
#remember, we are still grouped by Marker
data.table( p.value = t.test( Value[Patient == group1],
Value[Patient == group2])$p.value),
#group by group1 and group2
by = .(group1, group2) ]
# for Marker == M1, this looks like:
# group1 group2 p.value
# 1: P1 P2 0.3945742
# 2: P1 P3 0.5678729
# 3: P2 P3 0.7820905
# for Marker == M2, this looks like:
# group1 group2 p.value
# 1: P1 P2 0.3098955
# 2: P1 P3 0.7505371
# 3: P2 P3 0.0372944
},
#main grouping by Marker
by = .(Marker) ]
output
seems to match desired output
# Marker group1 group2 p.value
# 1: M1 P1 P2 0.3945742
# 2: M1 P1 P3 0.5678729
# 3: M1 P2 P3 0.7820905
# 4: M2 P1 P2 0.3098955
# 5: M2 P1 P3 0.7505371
# 6: M2 P2 P3 0.0372944
Upvotes: 3