RHT
RHT

Reputation: 21

Is there an R function to do multiple correlations rather than rewritting the code block

I'm currently having to subset data in multiple ways to extract the data I need to do a correlation and identify if there is a relationship between the same metals of the groups.

Group1 <- subset(Data_Set, subset = Data_Set$Sample == "1")
Group1A <- subset(Group1, subset = Group1$Sample_Type == "A")
GroupX <- subset(Data_Set, subset = Data_Set$Sample == "X")
GroupX<- subset(GroupX, Sample_ID %in% Group1A$Sample_ID )
cor.test(Group1A$Pb,GroupX$Pb, method = "kendall")

ID is used to match between groups

However, this is very inefficient. I have in total 6 groups which have subgroups (between 1-6) composed of 1-40 samples and I looking to see if there is a relationship between any of these groups and GroupX. Is there a function to speed this up.

The output would be something along the lines of:

Pb                 T     p-value    tau                   
Group1A~GroupX    340     0.001    0.5902
Group1B~GroupX    435     0.03     0.2344
.....
Group6C~GroupX    344     0.001    0.4566

And this would be repeated for 5 other metals

I was thinking a standard correlation matrix but this does correlations between metals within a group.

Thanks!

EDIT: Sample data as requested

structure(list(Sample = c("2", "2", "2", "2", "X", "2", "2", 
"2", "2", "2", "2", "X", "2", "2", "5", "5", "5", "5", "5", "X", 
"5", "5", "3", "3", "X", "3", "3", "X", "4", "4"), Sample_ID = c("DC001", 
"DC001", "DC001", "DC001", "DC001", "DC001", "DC001", "DC002", 
"DC002", "DC002", "DC002", "DC002", "DC002", "DC002", "DC003", 
"DC003", "DC003", "DC003", "DC003", "DC003", "DC003", "DC003", 
"DC004", "DC004", "DC004", "DC005", "DC005", "DC005", "DC006", 
"DC006"), Sample_Type = c("A", "D", "E", "F", "X", "I", "J", 
"A", "D", "E", "F", "X", "I", "J", "A", "B", "D", "E", "F", "X", 
"I", "J", "C", "F", "X", "C", "F", "X", "A", "D"), Co = c(0, 
0.204473214269861, 0, 0.50977856054987, 0.262230521160956, 0, 
0, 0, 0, 0, 0.465855303428853, 0.229502158969648, 0.214970121592712, 
0.588126362402572, 0, 0.0906122639531158, 0.229838105464066, 
0, 0.240533898070871, 4.77802122014029, 0.47537095149254, 0.384495379166814, 
0.00135414270258444, 0.458235177876183, 0.412977043885698, 0.187579567424379, 
0.317854941692133, 0.0271598068567071, 0, 0.293328743450483), 
    Ni = c(2.32894078024542, 0, 2.75976812547636, 2.35251746719724, 
    0.351631195258774, 1.25476391714642, 0.0586626807902249, 
    0, 2.31716731851309, 0, 4.03426936736104, 0.414520597983989, 
    2.69897385721456, 0.781651988488391, 1.48260693680732, 1.59083944326126, 
    0.944038748319438, 3.06889126279262, 1.69552165261712, 0.849220149877567, 
    1.75387912556474, 0, 0.333762199305291, 1.66187141150986, 
    0.735834552887327, 3.72419677755011, 1.27862769479216, 0.264762516047524, 
    1.84288031704096, 1.8828793053893), Cu = c(16.6696573471153, 
    21.377014252538, 16.4581203986139, 6.49438237470201, 1.57054125960644, 
    5.67180974109468, 23.5835333332964, 38.6483288663375, 15.2589198442198, 
    21.9746392829346, 7.09307693625389, 0.967127488045321, 6.32542891436958, 
    16.1173426649179, 11.2222721930992, 8.42093833910001, 11.1332246071585, 
    16.7442343774396, 10.8140656299147, 14.2632807636599, 5.35502290473828, 
    7.29141216675894, 2.53789491234011, 16.5791995430022, 1.00648647764661, 
    26.6313784234462, 0.0413060789264422, 0.656674377606213, 
    3.98095036332964, 6.17760205144632), Zn = c(76.5281110975817, 
    2652.50181007495, 1007.00556337852, 206.99812727191, 640.15733114957, 
    484.221162531697, 3718.61286231799, 131.574098527507, 9826.49966864988, 
    1827.75831773692, 557.015412652748, 850.519284594127, 955.085171501707, 
    3039.23169926716, 117.947177178762, 65.7886442827721, 78.1092625035093, 
    253.691311074245, 980.544294923672, 506.400193234096, 1110.92409209043, 
    902.659801267825, 284.143460051779, 991.762202132739, 899.71040333897, 
    1686.99915717559, 27.0835877755038, 956.364728487396, 142.167067778216, 
    1012.61495002819), As = c(0, 0, 1.91185052013389, 1.32808264279786, 
    0.141039242323703, 1.74872331719823, 0.1065340816859, 0.812367854870543, 
    0, 0.797230094696634, 2.38925992872935, 0.305621793073037, 
    0.664951374730799, 0, 0, 0, 2.52051964809224, 0, 0, 0.392178178336116, 
    0, 3.08334159340895, 2.32108729394528, 1.62081021652742, 
    0.171200134084414, 6.19125023716284, 4.43213876523911, 0.289386770990403, 
    0.313331113399545, 6.41607755268465), Cd = c(8.22465741493669, 
    22.6126042664945, 34.0150873273517, 13.5844058876617, 5.22665850051452, 
    24.0465414683255, 109.478598702669, 15.1992477278811, 169.517190223851, 
    75.2983940524065, 34.5230481628261, 3.75297525105592, 45.6178498733986, 
    247.435132822196, 2.10793502840313, 1.47647473271431, 0.0848090794945706, 
    2.98717760781629, 3.13384011407655, 5.31936421369202, 3.73593799828465, 
    5.36310372449921, 0.298562637256625, 1.82673831232711, 3.78462211601718, 
    8.0628550389363, 0.138799690323038, 1.32275598609847, 0.285061500560821, 
    0.635235209786838), Pb = c(0.922803462498185, 5.13959353157866, 
    1.9525414480789, 0, 2.5902978681043, 1.21865949505257, 7.09067896476338, 
    0, 3.89524247237658, 0.354938950934777, 2.64634863087263, 
    0.356658949506862, 1.25701617111933, 4.18799241835111, 0, 
    0.807369345092201, 0.0263264119388502, 0, 3.32333444396018, 
    76.7555925603143, 0.613522400825461, 0, 1.72315815094652, 
    3.21414903849599, 1.03802696495681, 1.73176109371547, 0.72736174943572, 
    0.23309888503164, 12.8688959655249, 33.2486209089115)), row.names = c(NA, 
-30L), class = c("tbl_df", "tbl", "data.frame"))

Upvotes: 1

Views: 123

Answers (1)

Chuck P
Chuck P

Reputation: 3923

Here's a solution that get's you exactly what you want. It looks a little long and forbidding but I've tried to make it easy to follow and to expand and bullet it proof to missing observations, and NAs etc.. I also have a brute force solution that does **all* the correlations then removes all those you don't need.

library(dplyr)
library(stringr)
library(purrr)
library(broom)

# made up data set that is similar to yours but with missing rows and NAs
set.seed(2020)
Data_Set <-
  data.frame(
    Sample = c(rep("X", times = 10), rep("2", times = 20), "X", "2"),
    Sample_ID = c(rep(c("DC001", "DC002", "DC003", "DC004", "DC005", "DC006", "DC007", "DC008", "DC009", "DC010"), times = 3), "DC011", "DC012"),
    Sample_Type = c(rep("X", times = 10), rep("A", times = 10), rep("D", times = 10), "X", "A"),
    Co = runif(32, 0, 5),
    Ni = runif(32, 0, 4.1),
    Cu = runif(32, 0, 39),
    Zn = runif(32, 27, 9800),
    As = runif(32, 0, 6),
    Cd = runif(32, 0, 247),
    Pb = runif(32, 0, 78)
  )

Data_Set[15,5] <- NA

# Data_Set

# Collapse Sample and Sample_Type into one Group variable
Data_Set <-
  Data_Set %>%
  mutate(Group = str_c(Sample, Sample_Type)) %>%
  select(Group, everything())

# Pull out Group XX (our baseline) and relabel
# the metals with an _X on the end

JustGroupX <-
  Data_Set %>%
  filter(Group == "XX") %>%
  mutate(Group = "X") %>%
  rename(Co_X = Co,
         Ni_X = Ni,
         Cu_X = Cu,
         Zn_X = Zn,
         As_X = As,
         Cd_X = Cd,
         Pb_X = Pb) %>%
  select(-Group, -Sample, -Sample_Type)

# a df with no XX
AllNotX <-
  Data_Set %>%
  filter(Group != "XX")

# Make a list of DF's by Group
ListofGroupDFs <-
  AllNotX %>%
  split(.$Group)

# glimpse(ListofGroupDFs)

ListofGroupDFs <- map(ListofGroupDFs, ~ inner_join(., JustGroupX, by = "Sample_ID"))

# this part is inelegant since it simply repeats the same code for each metal
# I'll try and make it prettier another day

CoResults <-
  map_dfr(ListofGroupDFs,
          ~ tidy(cor.test(.$Co, .$Co_X, method = "kendall")),
          .id = "ComparedwithX") %>%
  mutate(Metal = "Co")
CoResults$Metal <- "Co"

NiResults <-
  map_dfr(ListofGroupDFs,
          ~ tidy(cor.test(.$Ni, .$Ni_X, method = "kendall")),
          .id = "ComparedwithX") %>%
  mutate(Metal = "Ni")
NiResults$Metal <- "Ni"

CuResults <-
  map_dfr(ListofGroupDFs,
          ~ tidy(cor.test(.$Cu, .$Cu_X, method = "kendall")),
          .id = "ComparedwithX") %>%
  mutate(Metal = "Cu")
CuResults$Metal <- "Cu"

ZnResults <-
  map_dfr(ListofGroupDFs,
          ~ tidy(cor.test(.$Zn, .$Zn_X, method = "kendall")),
          .id = "ComparedwithX") %>%
  mutate(Metal = "Zn")
ZnResults$Metal <- "Zn"

AsResults <-
  map_dfr(ListofGroupDFs,
          ~ tidy(cor.test(.$As, .$As_X, method = "kendall")),
          .id = "ComparedwithX") %>%
  mutate(Metal = "As")
AsResults$Metal <- "As"

CdResults <-
  map_dfr(ListofGroupDFs,
          ~ tidy(cor.test(.$Cd, .$Cd_X, method = "kendall")),
          .id = "ComparedwithX") %>%
  mutate(Metal = "Cd")
CdResults$Metal <- "Cd"

PbResults <-
  map_dfr(ListofGroupDFs,
        ~ tidy(cor.test(.$Pb, .$Pb_X, method = "kendall")),
        .id = "ComparedwithX") %>%
  mutate(Metal = "Pb")
PbResults$Metal <- "Pb"

MyResults <- rbind(CoResults,
                   NiResults,
                   CuResults,
                   ZnResults,
                   AsResults,
                   CdResults,
                   PbResults)

MyResults <-
  MyResults %>%
  rename(tau = estimate, T = statistic) %>%
  select(Metal,
         ComparedwithX,
         tau,
         T,
         p.value)

MyResults
#> # A tibble: 14 x 5
#>    Metal ComparedwithX     tau     T p.value
#>    <chr> <chr>           <dbl> <dbl>   <dbl>
#>  1 Co    2A             0.0222    23   1    
#>  2 Co    2D             0.0667    24   0.862
#>  3 Ni    2A             0.444     26   0.119
#>  4 Ni    2D            -0.111     20   0.727
#>  5 Cu    2A            -0.2       18   0.484
#>  6 Cu    2D             0.0667    24   0.862
#>  7 Zn    2A             0.289     29   0.291
#>  8 Zn    2D             0.156     26   0.601
#>  9 As    2A            -0.0222    22   1    
#> 10 As    2D            -0.422     13   0.108
#> 11 Cd    2A             0.2       27   0.484
#> 12 Cd    2D            -0.0667    21   0.862
#> 13 Pb    2A            -0.333     15   0.216
#> 14 Pb    2D             0.2       27   0.484

Upvotes: 1

Related Questions