Melanie Baker
Melanie Baker

Reputation: 591

Add kruskall-wallis and paiwise comparisons test outputs to dataframe

I have several boxplots that I am running some statistics on through a for loop.

I want to be able to output the data into a dataframe. i.e. something like this (I've made up random numbers):

For the Kruskal-Wallis:

enter image description here

For the pairwise comparisons:

enter image description here

This is my code:

names <- c('B1','B2','B3')

for (name in names){
  mereLoc <- scaledDaily %>%
    filter(scaledDaily$loc == name)
  mereLoc$year <- as.numeric(mereLoc$year)
  median <- tapply(mereLoc$mAODscale, mereLoc$year, median, na.rm=T)
  print(name)
  print(median)

  krusk <- kruskal.test(mAODscale~year, data=mereLoc)
  print(krusk)

  pairs <- pairwise.wilcox.test(mereLoc$mAODscale, mereLoc$year, p.adjust.method='BH')
  print(pairs)
}

Test Data

here is some test data. its not my actual data, I've just fabricated some.

structure(list(loc = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L), levels = c("B1", "B2", "B3"), class = "factor"), 
    mAODscale = c(0.561948848, 0.851883432, 0.293151829, 0.683807808, 
    0.864472706, 0.380303934, 0.324501709, 0.765022304, 0.900772901, 
    0.204731715, 0.877104175, 0.56367162, 0.206528162, 0.353350116, 
    0.219628257, 0.840723901, 0.716389918, 0.569798858, 0.707583441, 
    0.120064246, 0.275325307, 0.438391155, 0.308969668, 0.350156436, 
    0.886955315, 0.693416677, 0.710065022), year = structure(c(1L, 
    1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 
    3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), levels = c("1", 
    "2", "3"), class = "factor")), row.names = c(NA, -27L), class = "data.frame")

Upvotes: 2

Views: 58

Answers (2)

Melanie Baker
Melanie Baker

Reputation: 591

Here is my solution:

names <- c('B1','B2','B3','B4','B5','B6','B7','B8',
           'SSSI1','SSSI2','SSSI3',
           "CM1", "CM2", "CM3", "CM4", "CM7", "CM8", "CM9", "CM11",
           "Crose 1", "Crose 2", "Crose 3")

stats <- list()
for (name in names){
  mereLoc <- scaledDaily %>%
    filter(scaledDaily$loc == name)
  mereLoc$year <- as.numeric(mereLoc$year)
  median <- aggregate(mereLoc$mAODscale, list(mereLoc$year), median, na.rm=T)
  
  krusk <- kruskal.test(mAODscale~year, data=mereLoc)
  krusk2 <- tidy(krusk)
  pairs <- pairwise.wilcox.test(mereLoc$mAODscale, mereLoc$year, p.adjust.method='BH')
  pairs2 <- tidy(pairs)

  stats[[name]] <- c("loc"=name, 
                     "Y1_median"=median$x[1], 
                     "Y2_median"=median$x[2],
                     "Y3_median"=median$x[3], 
                     "KWCHiSq"=krusk$statistic,
                     "df"=krusk$parameter,
                     "pvalue"=krusk$p.value, 
                     "Y1Y2"=pairs2[1,3], 
                     "Y1Y3"=pairs2[2,3],  
                     "Y2Y3"=pairs2[3,3])
}

stats <- Reduce(function(x,y) merge(x,y,all=TRUE), stats)
names(stats) <- c('loc','Y1_median','Y2_median','Y3_median',
                  'KW-ChiSq','df','pvalue','Y1-Y2','Y1-Y3','Y2-Y3')

Upvotes: 0

Rui Barradas
Rui Barradas

Reputation: 76402

Here is a way.
If the data is in the long format then it is generally easier to process by tidyverse pipes. In the code below start by splitting by groups of location, run the tests, tidy as a data.frame and assemble the parts.
Reshape the results to wide format when needed.

suppressPackageStartupMessages({
  library(dplyr)
  library(broom)
  library(tidyr)
})

bind_cols(
  scaledDaily %>%
    group_split(loc) %>%
    lapply(\(X) {
      loc <- first(X$loc)
      kruskal.test(mAODscale ~ year, data = X) %>% 
        tidy() %>%
        mutate(Location = loc) %>%
        relocate(Location)
    }) %>%
    do.call(rbind, args = .) %>%
    select(Location, statistic, p.value),
  scaledDaily %>%
    group_split(loc) %>%
    lapply(\(X) {
      X %>%
        group_by(year) %>%
        summarise(median = median(mAODscale)) %>%
        pivot_wider(
          names_from = year,
          names_glue = "Y{year}_median",
          values_from = median
        )
    }) %>%
    do.call(rbind, .) 
) %>% relocate(ends_with("median"), .after = Location)
#> # A tibble: 3 × 6
#>   Location Y1_median Y2_median Y3_median statistic p.value
#>   <fct>        <dbl>     <dbl>     <dbl>     <dbl>   <dbl>
#> 1 B1           0.562     0.684     0.765     0.622   0.733
#> 2 B2           0.564     0.220     0.716     3.20    0.202
#> 3 B3           0.275     0.350     0.710     4.36    0.113


scaledDaily %>%
  group_split(loc) %>%
  lapply(\(X) {
    loc <- first(X$loc)
    pairwise.wilcox.test(X$mAODscale, X$year, p.adjust.method='BH') %>% 
      tidy() %>%
      mutate(Location = loc) %>%
      relocate(Location)
  }) %>%
  do.call(rbind, args = .) %>%
  pivot_wider(
    id_cols = Location,
    names_from = starts_with("group"),
    names_glue = "Y{group1}_Y{group2}",
    values_from = p.value
  )
#> # A tibble: 3 × 4
#>   Location Y2_Y1 Y3_Y1 Y3_Y2
#>   <fct>    <dbl> <dbl> <dbl>
#> 1 B1         1     1     1  
#> 2 B2         0.7   0.7   0.3
#> 3 B3         0.7   0.3   0.3

Created on 2023-08-23 with reprex v2.0.2

Upvotes: 1

Related Questions