Nova
Nova

Reputation: 5891

Tidyverse/faster solution to conditional formatting with openxlsx in R?

I'm working with genetic data that look like this table, but larger:

ID allele.a allele.b
A      115       90
A      115       90
A      116       90
B      120       82
B      120       82
B      120      82M

My goal is to highlight, for each ID, which alleles don't match the alleles listed on the first row of each ID group. I need to export the data to a nicely formatted excel file.

Here's what I want:

Desired result

I can get there with the following script, but the actual script involves about 67 "ID"s, 1000 rows of data, and 37 columns. It takes about 5 minutes to run, so I'm hoping to find a solution that significantly decreases processing time. Maybe a "do" solution from the tidyverse - not sure how that would look.

Here's my script, including a test data.frame. Also including a bigger test data.frame for speed testing.

library(xlsx)
library(openxlsx)
library(tidyverse)

# Small data.frame
dframe <- data.frame(ID = c("A", "A", "A", "B", "B", "B"),
                     allele.a = c("115", "115", "116", "120", "120", "120"),
                     allele.b = c("90", "90", "90", "82", "82", "82M"),
                     stringsAsFactors = F)

# Bigger data.frame for speed test
# dframe <- data.frame(ID = rep(letters, each = 30),
#                      allele.a = rep(as.character(round(rnorm(n = 30, mean = 100, sd = 0.3), 0)), 26),
#                      allele.b = rep(as.character(round(rnorm(n = 30, mean = 90, sd = 0.3), 0)), 26),
#                      allele.c = rep(as.character(round(rnorm(n = 30, mean = 80, sd = 0.3), 0)), 26),
#                      allele.d = rep(as.character(round(rnorm(n = 30, mean = 70, sd = 0.3), 0)), 26),
#                      allele.e = rep(as.character(round(rnorm(n = 30, mean = 60, sd = 0.3), 0)), 26),
#                      allele.f = rep(as.character(round(rnorm(n = 30, mean = 50, sd = 0.3), 0)), 26),
#                      allele.g = rep(as.character(round(rnorm(n = 30, mean = 40, sd = 0.3), 0)), 26),
#                      allele.h = rep(as.character(round(rnorm(n = 30, mean = 30, sd = 0.3), 0)), 26),
#                      allele.i = rep(as.character(round(rnorm(n = 30, mean = 20, sd = 0.3), 0)), 26),
#                      allele.j = rep(as.character(round(rnorm(n = 30, mean = 10, sd = 0.3), 0)), 26),
#                      stringsAsFactors = F)



# Create a new excel workbook ----
wb <- createWorkbook()

# Add a worksheets
addWorksheet(wb, sheet = 1, gridLines = TRUE)

# add the data to the worksheet        
writeData(wb, sheet = 1, dframe, rowNames = FALSE)      

# Create a style to show alleles that do not match the first row.
style_Red_NoMatch <- createStyle(fontColour = "#FFFFFF", # white text
                                 bgFill = "#CC0000", # Dark red background
                                 textDecoration = c("BOLD")) # bold text

Groups <- unique(dframe$ID)

start_time <- Sys.time()
# For each unique group, 
for(i in 1:length(Groups)){

  # Print a message telling us where the script is processing in the file.
  print(paste("Formatting unique group ", i, "/", length(Groups), sep = ""))

  # What are the allele values of the *first* individual in the group?
  Allele.values <- dframe %>% 
    filter(ID == Groups[i]) %>% 
    slice(1) %>% 
    select(2:ncol(dframe)) %>% 
    as.character()

  # for each column that has allele values in it,
  for (j in 1:length(Allele.values)){
    # format the rest of the rows so that a value that does not match the first value gets red style


    conditionalFormatting(wb, sheet = 1, 
                          style_Red_NoMatch, 
                          rows = (which(dframe$ID == Groups[i]) + 1), 
                          cols = 1+j,  rule=paste("<>\"", Allele.values[j], "\"", sep = ""))
  }

}
end_time <- Sys.time()
end_time - start_time

saveWorkbook(wb, "Example.xlsx", overwrite = TRUE)

Upvotes: 5

Views: 1527

Answers (1)

MKa
MKa

Reputation: 2318

I guess one way to improve is to apply conditionalFormatting on the entire column instead of having to loop through each cell.

Here is one way. One downside with this approach though is that it creates logical vector of TRUE and FALSE which is used for conditionalFormatting. These columns can be hidden using setColWidths function.

Data

library(openxlsx)

 dframe <- data.frame(ID = rep(letters, each = 30),
                      allele.a = rep(as.character(round(rnorm(n = 30, mean = 100, sd = 0.3), 0)), 26),
                      allele.b = rep(as.character(round(rnorm(n = 30, mean = 90, sd = 0.3), 0)), 26),
                      allele.c = rep(as.character(round(rnorm(n = 30, mean = 80, sd = 0.3), 0)), 26),
                      allele.d = rep(as.character(round(rnorm(n = 30, mean = 70, sd = 0.3), 0)), 26),
                      allele.e = rep(as.character(round(rnorm(n = 30, mean = 60, sd = 0.3), 0)), 26),
                      allele.f = rep(as.character(round(rnorm(n = 30, mean = 50, sd = 0.3), 0)), 26),
                      allele.g = rep(as.character(round(rnorm(n = 30, mean = 40, sd = 0.3), 0)), 26),
                      allele.h = rep(as.character(round(rnorm(n = 30, mean = 30, sd = 0.3), 0)), 26),
                      allele.i = rep(as.character(round(rnorm(n = 30, mean = 20, sd = 0.3), 0)), 26),
                      allele.j = rep(as.character(round(rnorm(n = 30, mean = 10, sd = 0.3), 0)), 26),
                      stringsAsFactors = F)

First part of the script is unchanged.

# Create a new excel workbook ----
wb <- createWorkbook()

# Add a worksheets
addWorksheet(wb, sheet = 1, gridLines = TRUE)
    
# Create a style to show alleles that do not match the first row.
style_Red_NoMatch <- createStyle(fontColour = "#FFFFFF", # white text
                                 bgFill = "#CC0000", # Dark red background
                                 textDecoration = c("BOLD")) # bold text

Then identify first row for each ID and merge into the original dataset. Then check to see if there are any changes in any cells (loops through each column).

# selects first row for each ID which will be used as benchmark
first_row <- dframe[!duplicated(dframe$ID), ]

# Creating new df with the first_row columns added
dframe_chk <- merge(dframe, first_row, by = "ID",  all.x = TRUE, suffixes = c("", "_first"))

# Adding TRUE/FALSE factor for each column to see if it matches or not (-1 to exclude ID column)
for (j in names(dframe)[-1]) {
  
  dframe_chk[, paste0(j, "_chk")] <- dframe_chk[, j] == dframe_chk[, paste0(j, "_first")]
  
}

# Remove _first columns when exporting into Excel
cols <- names(dframe_chk)[!grepl("_first", names(dframe_chk))]

# add the data to the worksheet        
writeData(wb, sheet = 1, dframe_chk[, cols], rowNames = FALSE)      

# This is for conditional Formatting
# first_row is header
row_start <- 2

# Need to add 1 to cover full range (as first row is header)
row_end <- nrow(dframe) + 1

# first column is ID
col_start <- 2 

# last column as per the original dataset
col_end <- ncol(dframe)

# this is to point to the _chk column.
# Note if you have columns more than A-Z then this needs to be adjusted
rule_col <- LETTERS[col_end + 1] 

# Using the _chk column to apply conditional formula
conditionalFormatting(wb, sheet = 1, 
                      style_Red_NoMatch, 
                      rows = row_start:row_end,
                      cols = col_start:col_end,  
                      rule = paste0(rule_col, "2 = FALSE"))

# Exported file includes _chk columns. Hide these columns.
setColWidths(wb, sheet = 1, cols = (col_end + 1):length(cols), hidden = TRUE)

saveWorkbook(wb, "Example2.xlsx", overwrite = TRUE)

Upvotes: 0

Related Questions