Reputation: 21
I'm currently having to subset data in multiple ways to extract the data I need to do a correlation and identify if there is a relationship between the same metals of the groups.
Group1 <- subset(Data_Set, subset = Data_Set$Sample == "1")
Group1A <- subset(Group1, subset = Group1$Sample_Type == "A")
GroupX <- subset(Data_Set, subset = Data_Set$Sample == "X")
GroupX<- subset(GroupX, Sample_ID %in% Group1A$Sample_ID )
cor.test(Group1A$Pb,GroupX$Pb, method = "kendall")
ID is used to match between groups
However, this is very inefficient. I have in total 6 groups which have subgroups (between 1-6) composed of 1-40 samples and I looking to see if there is a relationship between any of these groups and GroupX. Is there a function to speed this up.
The output would be something along the lines of:
Pb T p-value tau
Group1A~GroupX 340 0.001 0.5902
Group1B~GroupX 435 0.03 0.2344
.....
Group6C~GroupX 344 0.001 0.4566
And this would be repeated for 5 other metals
I was thinking a standard correlation matrix but this does correlations between metals within a group.
Thanks!
EDIT: Sample data as requested
structure(list(Sample = c("2", "2", "2", "2", "X", "2", "2",
"2", "2", "2", "2", "X", "2", "2", "5", "5", "5", "5", "5", "X",
"5", "5", "3", "3", "X", "3", "3", "X", "4", "4"), Sample_ID = c("DC001",
"DC001", "DC001", "DC001", "DC001", "DC001", "DC001", "DC002",
"DC002", "DC002", "DC002", "DC002", "DC002", "DC002", "DC003",
"DC003", "DC003", "DC003", "DC003", "DC003", "DC003", "DC003",
"DC004", "DC004", "DC004", "DC005", "DC005", "DC005", "DC006",
"DC006"), Sample_Type = c("A", "D", "E", "F", "X", "I", "J",
"A", "D", "E", "F", "X", "I", "J", "A", "B", "D", "E", "F", "X",
"I", "J", "C", "F", "X", "C", "F", "X", "A", "D"), Co = c(0,
0.204473214269861, 0, 0.50977856054987, 0.262230521160956, 0,
0, 0, 0, 0, 0.465855303428853, 0.229502158969648, 0.214970121592712,
0.588126362402572, 0, 0.0906122639531158, 0.229838105464066,
0, 0.240533898070871, 4.77802122014029, 0.47537095149254, 0.384495379166814,
0.00135414270258444, 0.458235177876183, 0.412977043885698, 0.187579567424379,
0.317854941692133, 0.0271598068567071, 0, 0.293328743450483),
Ni = c(2.32894078024542, 0, 2.75976812547636, 2.35251746719724,
0.351631195258774, 1.25476391714642, 0.0586626807902249,
0, 2.31716731851309, 0, 4.03426936736104, 0.414520597983989,
2.69897385721456, 0.781651988488391, 1.48260693680732, 1.59083944326126,
0.944038748319438, 3.06889126279262, 1.69552165261712, 0.849220149877567,
1.75387912556474, 0, 0.333762199305291, 1.66187141150986,
0.735834552887327, 3.72419677755011, 1.27862769479216, 0.264762516047524,
1.84288031704096, 1.8828793053893), Cu = c(16.6696573471153,
21.377014252538, 16.4581203986139, 6.49438237470201, 1.57054125960644,
5.67180974109468, 23.5835333332964, 38.6483288663375, 15.2589198442198,
21.9746392829346, 7.09307693625389, 0.967127488045321, 6.32542891436958,
16.1173426649179, 11.2222721930992, 8.42093833910001, 11.1332246071585,
16.7442343774396, 10.8140656299147, 14.2632807636599, 5.35502290473828,
7.29141216675894, 2.53789491234011, 16.5791995430022, 1.00648647764661,
26.6313784234462, 0.0413060789264422, 0.656674377606213,
3.98095036332964, 6.17760205144632), Zn = c(76.5281110975817,
2652.50181007495, 1007.00556337852, 206.99812727191, 640.15733114957,
484.221162531697, 3718.61286231799, 131.574098527507, 9826.49966864988,
1827.75831773692, 557.015412652748, 850.519284594127, 955.085171501707,
3039.23169926716, 117.947177178762, 65.7886442827721, 78.1092625035093,
253.691311074245, 980.544294923672, 506.400193234096, 1110.92409209043,
902.659801267825, 284.143460051779, 991.762202132739, 899.71040333897,
1686.99915717559, 27.0835877755038, 956.364728487396, 142.167067778216,
1012.61495002819), As = c(0, 0, 1.91185052013389, 1.32808264279786,
0.141039242323703, 1.74872331719823, 0.1065340816859, 0.812367854870543,
0, 0.797230094696634, 2.38925992872935, 0.305621793073037,
0.664951374730799, 0, 0, 0, 2.52051964809224, 0, 0, 0.392178178336116,
0, 3.08334159340895, 2.32108729394528, 1.62081021652742,
0.171200134084414, 6.19125023716284, 4.43213876523911, 0.289386770990403,
0.313331113399545, 6.41607755268465), Cd = c(8.22465741493669,
22.6126042664945, 34.0150873273517, 13.5844058876617, 5.22665850051452,
24.0465414683255, 109.478598702669, 15.1992477278811, 169.517190223851,
75.2983940524065, 34.5230481628261, 3.75297525105592, 45.6178498733986,
247.435132822196, 2.10793502840313, 1.47647473271431, 0.0848090794945706,
2.98717760781629, 3.13384011407655, 5.31936421369202, 3.73593799828465,
5.36310372449921, 0.298562637256625, 1.82673831232711, 3.78462211601718,
8.0628550389363, 0.138799690323038, 1.32275598609847, 0.285061500560821,
0.635235209786838), Pb = c(0.922803462498185, 5.13959353157866,
1.9525414480789, 0, 2.5902978681043, 1.21865949505257, 7.09067896476338,
0, 3.89524247237658, 0.354938950934777, 2.64634863087263,
0.356658949506862, 1.25701617111933, 4.18799241835111, 0,
0.807369345092201, 0.0263264119388502, 0, 3.32333444396018,
76.7555925603143, 0.613522400825461, 0, 1.72315815094652,
3.21414903849599, 1.03802696495681, 1.73176109371547, 0.72736174943572,
0.23309888503164, 12.8688959655249, 33.2486209089115)), row.names = c(NA,
-30L), class = c("tbl_df", "tbl", "data.frame"))
Upvotes: 1
Views: 123
Reputation: 3923
Here's a solution that get's you exactly what you want. It looks a little long and forbidding but I've tried to make it easy to follow and to expand and bullet it proof to missing observations, and NAs etc.. I also have a brute force solution that does **all* the correlations then removes all those you don't need.
library(dplyr)
library(stringr)
library(purrr)
library(broom)
# made up data set that is similar to yours but with missing rows and NAs
set.seed(2020)
Data_Set <-
data.frame(
Sample = c(rep("X", times = 10), rep("2", times = 20), "X", "2"),
Sample_ID = c(rep(c("DC001", "DC002", "DC003", "DC004", "DC005", "DC006", "DC007", "DC008", "DC009", "DC010"), times = 3), "DC011", "DC012"),
Sample_Type = c(rep("X", times = 10), rep("A", times = 10), rep("D", times = 10), "X", "A"),
Co = runif(32, 0, 5),
Ni = runif(32, 0, 4.1),
Cu = runif(32, 0, 39),
Zn = runif(32, 27, 9800),
As = runif(32, 0, 6),
Cd = runif(32, 0, 247),
Pb = runif(32, 0, 78)
)
Data_Set[15,5] <- NA
# Data_Set
# Collapse Sample and Sample_Type into one Group variable
Data_Set <-
Data_Set %>%
mutate(Group = str_c(Sample, Sample_Type)) %>%
select(Group, everything())
# Pull out Group XX (our baseline) and relabel
# the metals with an _X on the end
JustGroupX <-
Data_Set %>%
filter(Group == "XX") %>%
mutate(Group = "X") %>%
rename(Co_X = Co,
Ni_X = Ni,
Cu_X = Cu,
Zn_X = Zn,
As_X = As,
Cd_X = Cd,
Pb_X = Pb) %>%
select(-Group, -Sample, -Sample_Type)
# a df with no XX
AllNotX <-
Data_Set %>%
filter(Group != "XX")
# Make a list of DF's by Group
ListofGroupDFs <-
AllNotX %>%
split(.$Group)
# glimpse(ListofGroupDFs)
ListofGroupDFs <- map(ListofGroupDFs, ~ inner_join(., JustGroupX, by = "Sample_ID"))
# this part is inelegant since it simply repeats the same code for each metal
# I'll try and make it prettier another day
CoResults <-
map_dfr(ListofGroupDFs,
~ tidy(cor.test(.$Co, .$Co_X, method = "kendall")),
.id = "ComparedwithX") %>%
mutate(Metal = "Co")
CoResults$Metal <- "Co"
NiResults <-
map_dfr(ListofGroupDFs,
~ tidy(cor.test(.$Ni, .$Ni_X, method = "kendall")),
.id = "ComparedwithX") %>%
mutate(Metal = "Ni")
NiResults$Metal <- "Ni"
CuResults <-
map_dfr(ListofGroupDFs,
~ tidy(cor.test(.$Cu, .$Cu_X, method = "kendall")),
.id = "ComparedwithX") %>%
mutate(Metal = "Cu")
CuResults$Metal <- "Cu"
ZnResults <-
map_dfr(ListofGroupDFs,
~ tidy(cor.test(.$Zn, .$Zn_X, method = "kendall")),
.id = "ComparedwithX") %>%
mutate(Metal = "Zn")
ZnResults$Metal <- "Zn"
AsResults <-
map_dfr(ListofGroupDFs,
~ tidy(cor.test(.$As, .$As_X, method = "kendall")),
.id = "ComparedwithX") %>%
mutate(Metal = "As")
AsResults$Metal <- "As"
CdResults <-
map_dfr(ListofGroupDFs,
~ tidy(cor.test(.$Cd, .$Cd_X, method = "kendall")),
.id = "ComparedwithX") %>%
mutate(Metal = "Cd")
CdResults$Metal <- "Cd"
PbResults <-
map_dfr(ListofGroupDFs,
~ tidy(cor.test(.$Pb, .$Pb_X, method = "kendall")),
.id = "ComparedwithX") %>%
mutate(Metal = "Pb")
PbResults$Metal <- "Pb"
MyResults <- rbind(CoResults,
NiResults,
CuResults,
ZnResults,
AsResults,
CdResults,
PbResults)
MyResults <-
MyResults %>%
rename(tau = estimate, T = statistic) %>%
select(Metal,
ComparedwithX,
tau,
T,
p.value)
MyResults
#> # A tibble: 14 x 5
#> Metal ComparedwithX tau T p.value
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 Co 2A 0.0222 23 1
#> 2 Co 2D 0.0667 24 0.862
#> 3 Ni 2A 0.444 26 0.119
#> 4 Ni 2D -0.111 20 0.727
#> 5 Cu 2A -0.2 18 0.484
#> 6 Cu 2D 0.0667 24 0.862
#> 7 Zn 2A 0.289 29 0.291
#> 8 Zn 2D 0.156 26 0.601
#> 9 As 2A -0.0222 22 1
#> 10 As 2D -0.422 13 0.108
#> 11 Cd 2A 0.2 27 0.484
#> 12 Cd 2D -0.0667 21 0.862
#> 13 Pb 2A -0.333 15 0.216
#> 14 Pb 2D 0.2 27 0.484
Upvotes: 1