Luqman
Luqman

Reputation: 29

How can I convert my R code into a loop and reproducible script?

I have put together an R code for split plot analysis by exploring the internet (thanks to data scientists for making our lives easier). However, the problem now is I have more than a 100 variables to test. Inside this code there are a few instances where I need to change the column/variable name each time, axis names for each figure, and output figure name. Also, when the Anova results are printed on the screen I manually copy the p-values and lsd scores before moving forward. Is there a way to just automate everything to save many hours of manual editing? The code I am using is as below:

data <- read.csv("file.csv", header = TRUE)

data$Genotype <- as.factor(data$Genotype)
data$N.level <- as.factor(data$N.level)

library(agricolae)

attach(data)

Here before running the model I change response variables name each time, also the model output prints anova results which I am copying manually before moving on

model <- sp.plot(block = Block, 
             pplot = Genotype, 
             splot = N.level, 
             Y = GPC.30DAP)

Edf_a <- model$gl.a
Edf_b <- model$gl.b
EMS_a <- model$Ea
EMS_b <- model$Eb

Again I change Y for LSD test each time, and from LSD output manually copy the results of each test 1,2,3

out1 <- LSD.test(y = GPC.30DAP, 
             trt = Genotype,
             DFerror = Edf_a, 
             MSerror = EMS_a,
             alpha = 0.05,
             #p.adj = "bonferroni",
             group = TRUE,
             console = TRUE)

out2 <- LSD.test(y = GPC.30DAP, 
             trt = N.level,
             DFerror = Edf_b, 
             MSerror = EMS_b,
             alpha = 0.05,
             #p.adj = "bonferroni",
             group = TRUE,
             console = TRUE)

out3 <- LSD.test(y = GPC.30DAP, 
             trt = Genotype:N.level,
             DFerror = Edf_b, 
             MSerror = EMS_b,
             alpha = 0.05,
             #p.adj = "bonferroni",
             group = TRUE,
             console = TRUE)

library(dplyr)

ascend_AB = out3$groups %>%
  group_by(rownames(out3$groups)) %>%
  arrange(rownames(out3$groups))

print(ascend_AB)

CHANGE VARIABLE NAME AGAIN

MeanSD_AB = data %>%
  group_by(Genotype, N.level) %>%
  summarise(avg_AB = mean(GPC.30DAP),
            sd = sd(GPC.30DAP))

print(MeanSD_AB)

attach(MeanSD_AB)

##------ Plotting "G*N interaction"

library(ggplot2)

## Create plotting object
p3 = ggplot(MeanSD_AB, aes(x = N.level,
                           y = avg_AB,
                           fill = factor(Genotype)))
print(p3)

## Plotting bars
plotA = p3 + 
  geom_bar(stat = "identity",
           color = "black",
           position = position_dodge(width=0.9))
print(plotA)

## Adding error bars
plotC = plotA + 
  geom_errorbar(aes(ymax = avg_AB + sd,
                    ymin = avg_AB - sd), 
                position = position_dodge(width=0.9), 
                width = 0.25)
print(plotC)

Changing main title, X and Y labels and legend title

plotD = plotC + 
      labs(title = "",
           x = "",
           y = "GPC (%) 30 days after anthesis",
           fill = "Genotype")
print(plotD)

## Adding lettering from test applied
plotE = plotD + 
  geom_text(aes(x = N.level,
                y = avg_AB + sd,
                label = as.matrix(ascend_AB$groups)),
            position = position_dodge(width = 0.9),
            vjust = -(0.5))
print(plotE)

figure title change to avoid overwriting

ggsave("GPC 30 days after anthesis.jpeg", width = 10, height = 10, units = c("cm"), dpi = 300)

Upvotes: 0

Views: 143

Answers (1)

Mikko Marttila
Mikko Marttila

Reputation: 11878

There’s a few things going on here which I think would be useful to go over in some detail. Before diving in, let’s load the example data directly from pastebin:

library(agricolae)

data <- read.delim("https://pastebin.com/raw/kkjZVxsW")
data$Genotype <- as.factor(data$Genotype)
data$N.level <- as.factor(data$N.level)
data
#>    Block Genotype N.level GPC.15DAP GPC.23DAP GPC.30DAP
#> 1      1    1290H   Low N       4.9       2.7       2.6
#> 2      2    1290H   Low N       4.6       3.3       2.3
#> 3      3    1290H   Low N       5.1       2.7       3.7
#> 4      1    1290W   Low N       5.0       2.4       1.9
#> 5      2    1290W   Low N       4.4       3.2       2.7
#> 6      3    1290W   Low N       4.5       3.0       2.4
#> 7      1    1290H  High N       8.2       3.6       4.1
#> 8      2    1290H  High N       7.6       4.4       3.4
#> 9      3    1290H  High N       7.3       4.3       4.5
#> 10     1    1290W  High N       5.5       3.9       2.8
#> 11     2    1290W  High N       6.1       3.3       2.0
#> 12     3    1290W  High N       5.8       3.8       2.4

Looping

You might hope to loop over the variable names along these lines:

for (variable in c("GPC.15DAP", "GPC.30DAP")) {
  sp.plot(Block, Genotype, N.level, variable)
}

This won’t work because variable now contains the name of the variable in a string, rather than the actual value of the variable. Instead, you can get() the value given a name:

variable <- "GPC.30DAP"
with(data, {
  sp.plot(Block, Genotype, N.level, get(variable))
})
#> 
#> ANALYSIS SPLIT PLOT:  get(variable) 
#> Class level information
#> 
#> Genotype     :  1290H 1290W 
#> N.level  :  Low N High N 
#> Block    :  1 2 3 
#> 
#> Number of observations:  12 
#> 
#> Analysis of Variance Table
#> 
#> Response: get(variable)
#>                  Df Sum Sq Mean Sq F value  Pr(>F)  
#> Block             2 0.8600  0.4300     NaN     NaN  
#> Genotype          1 3.4133  3.4133  9.3945 0.09199 .
#> Ea                2 0.7267  0.3633     NaN     NaN  
#> N.level           1 1.0800  1.0800  5.6348 0.07651 .
#> Genotype:N.level  1 0.8533  0.8533  4.4522 0.10249  
#> Eb                4 0.7667  0.1917     NaN     NaN  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> cv(a) = 20.8 %, cv(b) = 15.1 %, Mean = 2.9

This gives the correct result, but there’s something not quite right. The name of the response variable given in the output is less than useful: get(variable). That happens because sp.plot() captures the code that was used to pass the arguments, and extracts the name to use for the response variable from that.

To get the name of the response variable to show up, we need some metaprogramming to modify our code before running it. I’ll use a toy example to introduce the concepts, and then we’ll see how they apply to the problem at hand.

The fist component is str2lang(), which takes a string and turns it into a “language object” – some unevaluated code:

code <- str2lang("1 + 1")
code
#> 1 + 1

Next, bquote() can insert our piece of code into another, using a special .() syntax:

bquote(2 * .(code))
#> 2 * (1 + 1)

Finally we have eval(), which takes a piece of code and runs it:

eval(bquote(2 * .(code)))
#> [1] 4

Let’s put the pieces together: first create a language object from the variable name in a string with str2lang(), then insert that into the sp.plot() code with bquote(), and finally eval() the result to run the analysis:

variable <- str2lang("GPC.30DAP")
code <- bquote({
  sp.plot(Block, Genotype, N.level, .(variable))
})
with(data, eval(code))
#> 
#> ANALYSIS SPLIT PLOT:  GPC.30DAP 
#> Class level information
#> 
#> Genotype     :  1290H 1290W 
#> N.level  :  Low N High N 
#> Block    :  1 2 3 
#> 
#> Number of observations:  12 
#> 
#> Analysis of Variance Table
#> 
#> Response: GPC.30DAP
#>                  Df Sum Sq Mean Sq F value  Pr(>F)  
#> Block             2 0.8600  0.4300     NaN     NaN  
#> Genotype          1 3.4133  3.4133  9.3945 0.09199 .
#> Ea                2 0.7267  0.3633     NaN     NaN  
#> N.level           1 1.0800  1.0800  5.6348 0.07651 .
#> Genotype:N.level  1 0.8533  0.8533  4.4522 0.10249  
#> Eb                4 0.7667  0.1917     NaN     NaN  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> cv(a) = 20.8 %, cv(b) = 15.1 %, Mean = 2.9

You could use this directly, but it might be helpful to create a little helper function to encapsulate the process. Then you won’t have to think about what’s happening in this fairly complex block of code when you actually call it:

fit_model <- function(variable) {
  variable <- str2lang(variable)
  code <- bquote({
    sp.plot(Block, Genotype, N.level, .(variable))
  })
  eval(code, parent.frame())
}

with(data, fit_model("GPC.30DAP"))

Now we have the tools to execute the loop we envisioned in the beginning:

for (variable in c("GPC.15DAP", "GPC.30DAP")) {
  with(data, fit_model(variable))
}
#> 
#> ANALYSIS SPLIT PLOT:  GPC.15DAP 
#> Class level information
#> 
#> Genotype     :  1290H 1290W 
#> N.level  :  Low N High N 
#> Block    :  1 2 3 
#> 
#> Number of observations:  12 
#> 
#> Analysis of Variance Table
#> 
#> Response: GPC.15DAP
#>                  Df  Sum Sq Mean Sq F value  Pr(>F)   
#> Block             2  0.1350  0.0675     NaN     NaN   
#> Genotype          1  3.4133  3.4133  67.147 0.01457 * 
#> Ea                2  0.1017  0.0508     NaN     NaN   
#> N.level           1 12.0000 12.0000  68.900 0.00115 **
#> Genotype:N.level  1  2.0833  2.0833  11.962 0.02585 * 
#> Eb                4  0.6967  0.1742     NaN     NaN   
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> cv(a) = 3.9 %, cv(b) = 7.3 %, Mean = 5.75 
#> 
#> 
#> ANALYSIS SPLIT PLOT:  GPC.30DAP 
#> Class level information
#> 
#> Genotype     :  1290H 1290W 
#> N.level  :  Low N High N 
#> Block    :  1 2 3 
#> 
#> Number of observations:  12 
#> 
#> Analysis of Variance Table
#> 
#> Response: GPC.30DAP
#>                  Df Sum Sq Mean Sq F value  Pr(>F)  
#> Block             2 0.8600  0.4300     NaN     NaN  
#> Genotype          1 3.4133  3.4133  9.3945 0.09199 .
#> Ea                2 0.7267  0.3633     NaN     NaN  
#> N.level           1 1.0800  1.0800  5.6348 0.07651 .
#> Genotype:N.level  1 0.8533  0.8533  4.4522 0.10249  
#> Eb                4 0.7667  0.1917     NaN     NaN  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> cv(a) = 20.8 %, cv(b) = 15.1 %, Mean = 2.9

Capturing results

Unfortunately sp.plot() simultaneously both fits the model and prints the results. This is not ideal, as when fitting hundreds of models you don’t want the output to just overwhelm your console. We need some additional tools to work around that:

# Helper to discard printed output when evaluating `expr`
silence.output <- function(expr) {
  sink(nullfile())
  on.exit(sink())
  expr
}

Now lapply() is a convenient way to loop and collect the results in a list:

variables <- c("GPC.15DAP", "GPC.30DAP")
models <- lapply(variables, \(variable) {
  with(data, fit_model(variable)) |> silence.output()
})

Inspecting the result, we see we’ve saved all the relevant components:

str(models, 2)
#> List of 2
#>  $ :List of 5
#>   ..$ ANOVA:Classes 'anova' and 'data.frame':    6 obs. of  5 variables:
#>   .. ..- attr(*, "heading")= chr [1:2] "Analysis of Variance Table\n" "Response: GPC.15DAP"
#>   ..$ gl.a : int 2
#>   ..$ gl.b : int 4
#>   ..$ Ea   : num 0.0508
#>   ..$ Eb   : num 0.174
#>  $ :List of 5
#>   ..$ ANOVA:Classes 'anova' and 'data.frame':    6 obs. of  5 variables:
#>   .. ..- attr(*, "heading")= chr [1:2] "Analysis of Variance Table\n" "Response: GPC.30DAP"
#>   ..$ gl.a : int 2
#>   ..$ gl.b : int 4
#>   ..$ Ea   : num 0.363
#>   ..$ Eb   : num 0.192

The ANOVA table is also accessible, which you can use directly instead of copying the results manually:

models[[1]]$ANOVA
#> Analysis of Variance Table
#> 
#> Response: GPC.15DAP
#>                  Df  Sum Sq Mean Sq F value  Pr(>F)   
#> Block             2  0.1350  0.0675     NaN     NaN   
#> Genotype          1  3.4133  3.4133  67.147 0.01457 * 
#> Ea                2  0.1017  0.0508     NaN     NaN   
#> N.level           1 12.0000 12.0000  68.900 0.00115 **
#> Genotype:N.level  1  2.0833  2.0833  11.962 0.02585 * 
#> Eb                4  0.6967  0.1742     NaN     NaN   
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

data.frame(
  term = rownames(models[[1]]$ANOVA),
  pval = models[[1]]$ANOVA$`Pr(>F)`
)
#>               term        pval
#> 1            Block         NaN
#> 2         Genotype 0.014567943
#> 3               Ea         NaN
#> 4          N.level 0.001150332
#> 5 Genotype:N.level 0.025851402
#> 6               Eb         NaN

Let’s give LSD.test() the same treatment. An additional complication here is that we need to pass more arguments than just the variable name now. To handle that with ..., we swap bquote() for substitute():

lsd_test <- function(variable, ...) {
  variable <- str2lang(variable)
  code <- substitute({
    LSD.test(variable, ...)
  })
  eval(code, parent.frame())
}

with(data, lsd_test("GPC.30DAP", Genotype, 2, 0.050833, console = TRUE))
#> 
#> Study: GPC.30DAP ~ Genotype
#> 
#> LSD t Test for GPC.30DAP 
#> 
#> Mean Square Error:  0.050833 
#> 
#> Genotype,  means and individual ( 95 %) CI
#> 
#>       GPC.30DAP       std r      LCL      UCL Min Max
#> 1290H  3.433333 0.8524475 6 3.037298 3.829368 2.3 4.5
#> 1290W  2.366667 0.3614784 6 1.970632 2.762702 1.9 2.8
#> 
#> Alpha: 0.05 ; DF Error: 2
#> Critical Value of t: 4.302653 
#> 
#> least Significant Difference: 0.560078 
#> 
#> Treatments with the same letter are not significantly different.
#> 
#>       GPC.30DAP groups
#> 1290H  3.433333      a
#> 1290W  2.366667      b

Now the code to analyse one variable would be:

with(data, {
  model <- fit_model("GPC.30DAP") |> silence.output()
  test1 <- lsd_test("GPC.30DAP", Genotype, model$gl.a, model$Ea)
  test2 <- lsd_test("GPC.30DAP", N.level, model$gl.b, model$Eb)
  test3 <- lsd_test("GPC.30DAP", Genotype:N.level, model$gl.b, model$Eb)
})

Again, a function to encapsulate this would be helpful:

analyze_variable <- function(data, variable, ...) {
  with(data, {
    model <- silence.output(fit_model(variable))

    test1 <- lsd_test(variable, Genotype, model$gl.a, model$Ea, ...)
    test2 <- lsd_test(variable, N.level, model$gl.b, model$Eb, ...)
    test3 <- lsd_test(variable, Genotype:N.level, model$gl.b, model$Eb, ...)

    list(variable = variable, model = model, tests = list(test1, test2, test3))
  })
}

analysis <- analyze_variable(data, "GPC.30DAP")
str(analysis, 2)
#> List of 3
#>  $ variable: chr "GPC.30DAP"
#>  $ model   :List of 5
#>   ..$ ANOVA:Classes 'anova' and 'data.frame':    6 obs. of  5 variables:
#>   .. ..- attr(*, "heading")= chr [1:2] "Analysis of Variance Table\n" "Response: GPC.30DAP"
#>   ..$ gl.a : int 2
#>   ..$ gl.b : int 4
#>   ..$ Ea   : num 0.363
#>   ..$ Eb   : num 0.192
#>  $ tests   :List of 3
#>   ..$ :List of 5
#>   .. ..- attr(*, "class")= chr "group"
#>   ..$ :List of 5
#>   .. ..- attr(*, "class")= chr "group"
#>   ..$ :List of 5
#>   .. ..- attr(*, "class")= chr "group"

Processing results

The analysis object that we return contains all the relevant information for further processing. We could for example produce a similar interaction plot to what you had from this result:

library(tidyverse)

plot_analysis <- function(analysis) {
  means <- analysis$tests[[3]]$means
  groups <- analysis$tests[[3]]$groups

  means |>
    cbind(groups = groups[, 2]) |>
    rename(mean = 1) |>
    rownames_to_column("term") |>
    separate(term, into = c("Genotype", "N.level"), sep = ":") |>
    ggplot(aes(N.level, mean, color = Genotype, group = Genotype)) +
      geom_errorbar(
        aes(ymin = LCL, ymax = UCL),
        position = position_dodge(0.1),
        width = 0.1
      ) +
      geom_line(position = position_dodge(0.1)) +
      geom_point(position = position_dodge(0.1), size = 3) +
      geom_text(
        aes(y = UCL, label = groups),
        position = position_dodge(0.1),
        vjust = -0.5,
        show.legend = FALSE
      ) +
      labs(y = analysis$variable, x = NULL)
}

plot_analysis(analysis)

Or we can make a helper to print the result summaries:

print_analysis <- function(analysis) {
  cat("Split plot ANOVA of", analysis$variable, "\n")
  print(analysis$model$ANOVA)

  test_rows <- list()
  for (test in analysis$tests) {
    row <- cbind(test$parameters, test$statistics)
    test_rows <- c(test_rows, list(row))
  }

  cat("\n")
  print(do.call("rbind", test_rows))

  invisible(analysis)
}

print_analysis(analysis)
#> Split plot ANOVA of GPC.30DAP 
#> Analysis of Variance Table
#> 
#> Response: GPC.30DAP
#>                  Df Sum Sq Mean Sq F value  Pr(>F)  
#> Block             2 0.8600  0.4300     NaN     NaN  
#> Genotype          1 3.4133  3.4133  9.3945 0.09199 .
#> Ea                2 0.7267  0.3633     NaN     NaN  
#> N.level           1 1.0800  1.0800  5.6348 0.07651 .
#> Genotype:N.level  1 0.8533  0.8533  4.4522 0.10249  
#> Eb                4 0.7667  0.1917     NaN     NaN  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#>          test p.ajusted           name.t ntr alpha   MSerror Df Mean       CV
#>    Fisher-LSD      none         Genotype   2  0.05 0.3633333  2  2.9 20.78522
#>  1 Fisher-LSD      none          N.level   2  0.05 0.1916667  4  2.9 15.09647
#>  2 Fisher-LSD      none Genotype:N.level   4  0.05 0.1916667  4  2.9 15.09647
#>     t.value       LSD
#>    4.302653 1.4973671
#>  1 2.776445 0.7017812
#>  2 2.776445 0.9924686

Now you can write one loop to do all the analyses, and then continue working with the list of results:

# Analyze all but the 3 first variables in data
variables <- tail(names(data), -3)
variables
#> [1] "GPC.15DAP" "GPC.23DAP" "GPC.30DAP"

analyses <- lapply(variables, \(variable) {
  analyze_variable(data, variable)
})

For example, we can save the results of the plots and summaries using the functions we created:

output_dir <- tempdir()

for (analysis in analyses) {
  # Save summaries
  output_file <- file.path(output_dir, paste0(analysis$variable, ".txt"))
  sink(output_file)
  print_analysis(analysis)
  sink()

  # Save plots
  output_file <- file.path(output_dir, paste0(analysis$variable, ".jpeg"))
  jpeg(output_file, width = 10, height = 10, units = "cm", res = 300)
  plot_analysis(analysis) |> print()
  dev.off()
}

list.files(output_dir, pattern = "^GPC")
#> [1] "GPC.15DAP.jpeg" "GPC.15DAP.txt"  "GPC.23DAP.jpeg" "GPC.23DAP.txt" 
#> [5] "GPC.30DAP.jpeg" "GPC.30DAP.txt"

Upvotes: 3

Related Questions