Set color and boxes on conditions from data set (boxplot)

Question

So I have a dataset called Carbon (Carbon$Graphite does contain a few values not seen in the head):

       Mesa  TIC   TOC Graphite    TC
Kx V17   Ja 5.26 15.64       NA 20.90
Mu V17  Nej 4.08 11.32       NA 15.40
Ob V17  Nej 5.22 12.68       NA 17.90
Vä V17   Ja 6.45  6.35       NA 12.80
Ös V17  Nej 3.90  2.54       NA  6.44
Ig V17   Ja 8.20  3.20       NA 11.40

I would like a boxplot that displays 8 boxes: one box that only contains the values of Carbon$TIC[Carbon$Mesa=="Ja"], one for Carbon$TIC[Carbon$Mesa=="Nej"], one for Carbon$TOC[Carbon$Mesa=="Ja"], and so forth. The colour (fill) of each box should follow Carbon$Mesa, so "Ja" = red box and "Nej" = blue box. I have managed to do this without ggplot, but I need to do it with ggplot (so that all my graphs look the same; the rest of the graphs are made with ggplot).

The code I used to make it without ggplot (this is what I want but with legend on the side as usual for ggplot):

# Split the samples by treatment: "Ja" rows are drawn in red ("Lime mud"),
# "Nej" rows in blue ("No lime mud").
MesaJa <- Carbon[Carbon$Mesa == "Ja", ]
MesaNej <- Carbon[Carbon$Mesa == "Nej", ]

# Fill colours in box order: three "Ja" boxes first, then three "Nej" boxes.
col.box <- rep(c("red", "blue"), each = 3)

# One box per carbon fraction (TIC, TOC, TC) within each treatment group.
boxplot(
  list(
    MesaJa$TIC, MesaJa$TOC, MesaJa$TC,
    MesaNej$TIC, MesaNej$TOC, MesaNej$TC
  ),
  names = rep(c("TIC", "TOC", "TC"), times = 2),
  col = col.box
)

# Manual legend, since base graphics does not derive one from `col`.
legend(
  "topleft",
  legend = c("Lime mud", "No lime mud"),
  pch = 19,
  col = c("red", "blue"),
  cex = 0.7
)

I have tried several different approaches but still not got it to work. The closest I got was this:

# Attempt with ggplot: add one helper column per variable/Mesa combination.
# Each helper keeps the measurement where Mesa matches and is NA otherwise.
Carbon$TIC_Ja <- ifelse(Carbon$Mesa=="Ja",Carbon$TIC, NA)
Carbon$TIC_Nej <- ifelse(Carbon$Mesa=="Nej",Carbon$TIC, NA)
Carbon$TOC_Ja <- ifelse(Carbon$Mesa=="Ja",Carbon$TOC, NA)
Carbon$TOC_Nej <- ifelse(Carbon$Mesa=="Nej",Carbon$TOC, NA)
Carbon$TC_Ja <- ifelse(Carbon$Mesa=="Ja",Carbon$TC, NA)
Carbon$TC_Nej <- ifelse(Carbon$Mesa=="Nej",Carbon$TC, NA)

# Keep column 1 plus columns 6-11; with the 5-column Carbon shown by str(),
# these are Mesa and the six helper columns added above.
Carbon.plot<-Carbon[ , c(1, 6:11)]
# Names of all Carbon columns.
# NOTE(review): gather() below is given `key=Carbon.key`; presumably tidyr
# takes that as the NAME of the new key column rather than reading this
# vector -- confirm.
Carbon.key <- colnames(Carbon)  

# Reshape every column except Mesa into key/value pairs, then draw one box
# per key, filled by key.
ggplot(
  gather(Carbon.plot, key=Carbon.key, value="value", -"Mesa"),
  aes(x=factor(Carbon.key), y=as.numeric(value), fill= Carbon.key) 
  ) + 
  geom_boxplot() +
  # Six fill values are supplied, but only two labels.
  scale_fill_manual(values=c("red", "blue", "red", "blue", "red", "blue"),
                    labels=c("Lime mud added", "No lime mud")
                    )

But it is still not good, as I only want the top two entries in the legend and want to remove the "NA". I also think that there must be an easier way that does not involve using ifelse to sort the data frame. I have scoured SO but so far have not seen any example that is similar. So, please help?

Edit: added info on the df and session info. But after posting the question I started to update some packages and, well, things did not go well, so currently I don't even have ggplot working.

> str(Carbon)
'data.frame':   70 obs. of  5 variables:
 $ Mesa    : chr  "Ja" "Nej" "Nej" "Ja" ...
 $ TIC     : num  5.26 4.08 5.22 6.45 3.9 ...
 $ TOC     : num  15.64 11.32 12.68 6.35 2.54 ...
 $ Graphite: num  NA NA NA NA NA NA NA NA NA NA ...
 $ TC      : num  20.9 15.4 17.9 12.8 6.44 11.4 12.9 21.6 11.8 15.3 ...

> dput(Carbon)
structure(list(Mesa = c("Ja", "Nej", "Nej", "Ja", "Nej", "Ja", 
"Ja", "Ja", "Ja", "Ja", "Nej", "Nej", "Nej", "Ja", "Ja", "Nej", 
"Nej", "Nej", "Nej", "Ja", "Ja", "Ja", "Ja", "Nej", "Nej", "Nej", 
"Ja", "Ja", "Ja", "Nej", "Nej", "Nej", "Nej", "Ja", "Ja", "Ja", 
"Ja", "Nej", "Nej", "Nej", "Ja", "Ja", "Ja", "Nej", "Nej", "Nej", 
"Ja", "Nej", "Ja", "Ja", "Ja", "Ja", "Nej", "Nej", "Nej", "Ja", 
"Ja", "Ja", "Nej", "Nej", "Nej", "Nej", "Ja", "Ja", "Ja", "Ja", 
"Nej", "Nej", "Nej", "Ja"), TIC = c(5.26, 4.08, 5.22, 6.45, 3.9, 
8.2, 10.67, 7.43, 9.55, 8.19, 7.83, 4.04, 2.66, 4.93, 7.41, 3.25, 
4.47, 4.385, 3.48, 8.01, 9.49, 8.93, 6.03, 7.32, 3.84, 2.42, 
5.01, 3.87, 7, 4.8, 5.64, 5.76, 5.69, 8.7, 10.2, 9.78, 6.1, 8.07, 
4.33, 3.98, 7.39, 9.68, 9.67, 3.75, 5.07, 4.7, 4.86, 2.98, 8.05, 
10.29, 9.99, 6.65, 8.85, 4.82, 3.84, 8.49, 3.86, 6.63, 3.49, 
3.01, 4.83, 3.78, 8.95, 10.1, 8.15, 6.16, 8.15, 4.27, 3.96, 4.96
), TOC = c(15.64, 11.32, 12.68, 6.35, 2.54, 3.2, 2.23, 14.17, 
2.25, 7.11, 2.37, 17.16, 36.14, 13.47, 5.29, 17.95, 14.63, 3.85, 
6.31, 3.19, 2.81, 7.27, 23.07, 1.94, 26.19, 36.63, 23.19, 11.37, 
5.1, 2.39, 18.46, 2.17, 2.45, 1.7, 4.2, 4.92, 20.2, 0.86, 20.67, 
33.32, 5.11, 2.01, 0.53, 6.48, 29.51, 2.5, 9.41, 3.42, 3.04, 
4, 4.1, 11.15, 1.94, 20.66, 31.73, 1.81, 16.39, 20.75, 6.61, 
33.98, 2.48, 3.15, 1.65, 2.4, 12.63, 17.33, 0.99, 25.62, 38.63, 
17.93), Graphite = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, 0.13, NA, NA, NA, NA, NA, 0.07, 
0.05, NA, 0.06, NA, 0.03, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, 0.07, 0.02, 0.08, 0.03, 0.33, NA, NA, NA, NA, 
NA, 0.02, 0.13, NA, 0.05, 0.02, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA), TC = c(20.9, 15.4, 17.9, 12.8, 6.44, 11.4, 12.9, 
21.6, 11.8, 15.3, 10.2, 21.2, 38.8, 18.4, 12.7, 21.2, 19.1, 8.235, 
9.92, 11.2, 12.3, 16.2, 29.1, 9.26, 30.1, 39.1, 28.2, 15.3, 12.1, 
7.22, 24.1, 7.93, 8.14, 10.4, 14.4, 14.7, 26.3, 8.93, 25, 37.3, 
12.5, 11.7, 10.2, 10.3, 34.6, 7.28, 14.3, 6.73, 11.1, 14.3, 14.1, 
17.8, 10.8, 25.5, 35.7, 10.3, 20.3, 27.4, 10.1, 37, 7.33, 6.93, 
10.6, 12.5, 20.8, 23.5, 9.14, 29.9, 42.6, 22.9)), class = "data.frame", row.names = c("Kx V17", 
"Mu V17", "Ob V17", "Vä V17", "Ös V17", "Ig V17", "Va V17", "Gä V17", 
"Sk V17", "Fr V17", "Gr V17", "Bi V17", "As V17", "Kx H17", "Pi H17", 
"Mu H17", "Ob H17", "Do H17", "Ös H17", "Ig H17", "Va H17", "Gä H17", 
"Fr H17", "Gr H17", "Bi H17", "As H17", "So H17", "Kx V18", "Pi V18", 
"Mu V18", "Ob V18", "Do V18", "Ös V18", "Ig V18", "Va V18", "Gä V18", 
"Fr V18", "Gr V18", "Bi V18", "As V18", "So V18", "Kx H18", "Pi H18", 
"Mu H18", "Ob H18", "Do H18", "Vä H18", "Ös H18", "Ig H18", "Va H18", 
"Gä H18", "Fr H18", "Gr H18", "Bi H18", "As H18", "So H18", "Kx V19", 
"Pi V19", "Mu V19", "Ob V19", "Do V19", "Ös V19", "Ig V19", "Va V19", 
"Gä V19", "Fr V19", "Gr V19", "Bi V19", "As V19", "So V19"))

> sessionInfo()
R version 3.5.3 (2019-03-11)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 18362)

Matrix products: default

locale:
[1] LC_COLLATE=Swedish_Sweden.1252  LC_CTYPE=Swedish_Sweden.1252   
[3] LC_MONETARY=Swedish_Sweden.1252 LC_NUMERIC=C                   
[5] LC_TIME=Swedish_Sweden.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

loaded via a namespace (and not attached):
 [1] crayon_1.3.4     grid_3.5.3       R6_2.4.1         lifecycle_0.2.0  gtable_0.3.0    
 [6] magrittr_1.5     scales_1.1.0     rlang_0.4.5      rstudioapi_0.11  ellipsis_0.3.0  
[11] tools_3.5.3      glue_1.4.0       munsell_0.5.0    compiler_3.5.3   colorspace_1.4-1

dc37 · Accepted Answer

One possible way is to pivot your dataframe into a longer format, for example with the pivot_longer function from tidyr (pivot_longer is available from tidyr 1.0.0 onwards):

library(tidyr)
library(dplyr)

# Stack the three carbon measurements into name/value pairs ("var"/"val"),
# then fix the factor order so the variables are TIC, TOC, TC rather than
# alphabetical.
Carbon %>%
  pivot_longer(
    cols = c(TIC, TOC, TC),
    names_to = "var",
    values_to = "val"
  ) %>%
  mutate(
    var = factor(var, levels = c("TIC", "TOC", "TC"))
  )

# A tibble: 210 x 4
   Mesa  Graphite var     val
   <chr>    <dbl> <fct> <dbl>
 1 Ja          NA TIC    5.26
 2 Ja          NA TOC   15.6 
 3 Ja          NA TC    20.9 
 4 Nej         NA TIC    4.08
 5 Nej         NA TOC   11.3 
 6 Nej         NA TC    15.4 
 7 Nej         NA TIC    5.22
 8 Nej         NA TOC   12.7 
 9 Nej         NA TC    17.9 
10 Ja          NA TIC    6.45
# … with 200 more rows

Then, you can use interaction to group "Mesa" and "var" as x values and get the following boxplot in ggplot2:

library(tidyr)
library(dplyr)
library(ggplot2)  # the package is named ggplot2; it provides ggplot(), aes(), ...

# Reshape TIC/TOC/TC into long format, then draw one box per
# (variable, Mesa) combination, filled by Mesa.
Carbon %>%
  pivot_longer(cols = c(TIC, TOC, TC), names_to = "var", values_to = "val") %>%
  # Fix the order of the variables (the default would be alphabetical).
  mutate(var = factor(var, levels = c("TIC", "TOC", "TC"))) %>%
  # interaction(var, Mesa) varies `var` fastest, so the x positions are
  # TIC.Ja, TOC.Ja, TC.Ja, TIC.Nej, TOC.Nej, TC.Nej.
  ggplot(aes(x = interaction(var, Mesa), y = val, fill = Mesa)) +
  geom_boxplot() +
  # Show only the variable name under each box; the group is given by the fill.
  scale_x_discrete(labels = rep(c("TIC", "TOC", "TC"), 2)) +
  # Named values tie each colour to its Mesa value explicitly rather than
  # relying on the order of the levels.
  scale_fill_manual(
    values = c(Ja = "red", Nej = "blue"),
    labels = c("Lime mud", "No lime mud")
  )

Does it answer your question?

Set color and boxes on conditions from data set (boxplot)

Answers (1)

Related Questions