Reputation: 534
I have the following R code, borrowed here, that generates a reproducible tibble
table:
# Install/load packages only if needed
# ************************************
if (!require("pacman")) install.packages("pacman")
pacman::p_load(dplyr, expss, ggplot2, grid, purrr, rlang, tibble)
# Data Generation
# ***************
# Set the seed for reproducibility
set.seed(123)
# Generate random data
n <- 490
PTSD <- sample(c(1, 2, NA), n, replace = TRUE) #class(PTSD) = "numeric"
ANX <- sample(c(1, 2, NA), n, replace = TRUE) #class(ANX) = "numeric"
DEP <- sample(c(1, 2, NA), n, replace = TRUE) #class(DEP) = "numeric"
# Create the data frame
df <- data.frame(PTSD, ANX, DEP) #class(df) = "data.frame"
# Label the values: 1 = Low, 2 = High
expss::val_lab(df$PTSD) = expss::num_lab("1 Low\n2 High")
expss::val_lab(df$ANX) = expss::num_lab("1 Low\n2 High")
expss::val_lab(df$DEP) = expss::num_lab("1 Low\n2 High")
# Create a list of tables for each variable to count 1s, 2s, and NAs
count_results <- list(
PTSD = table(df$PTSD, useNA = "ifany"),
ANX = table(df$ANX, useNA = "ifany"),
DEP = table(df$DEP, useNA = "ifany")
)
# Frequency count and data summary
# ********************************
# Combine the count tables into a single table
count_table <- do.call(rbind, count_results)
# Initialize empty vectors to store results
variable_names <- character()
sample_sizes <- numeric()
# Loop through the test results and extract relevant information
for (variable_name in names(count_results)) {
sample_sizes <- c(sample_sizes, sum(count_results[[variable_name]]))
variable_names <- c(variable_names, variable_name)
}
# Create summary data frame
summary_df <- data.frame(
Variable = variable_names,
N = sample_sizes
)
# Combine the count table and chi-squared summary table by columns
final_result <- cbind(count_table, summary_df)
# Remove Variable column in the middle of the table
final_result <- subset(final_result, select = -c(Variable))
# Combination of CMDs (CMD ≥ 1)
# *****************************
cmd <- c("PTSD","ANX","DEP")
combs <- map(seq_along(cmd),\(n)combn(cmd,n,simplify = FALSE)) |> purrr::flatten()
filts <- rlang::parse_exprs(map_chr(combs,\(x)paste0(x ,'== 2',collapse=' & ')))
filtsnames <- rlang::parse_exprs(map_chr(combs,\(x)paste0(x ,collapse=' + ')))
names(filts) <- filtsnames
output <- purrr::map_int(filts,\(x){
df %>%
mutate(id = row_number())%>%
filter(!!(x))%>%
summarise(
n = n())
} |> pull(n)
)
tibble::enframe(output)
The output of the tibble
table is supposed to show how many out of N = 490
have the following common mental disorders (CMDs), that is PTSD only, ANX only, DEP only, both PTSD and ANX, both PTSD and DEP, both ANX and DEP, and all 3 CMDs:
# A tibble: 7 × 2
name value
<chr> <int>
1 PTSD 167
2 ANX 156
3 DEP 156
4 PTSD + ANX 56
5 PTSD + DEP 52
6 ANX + DEP 51
7 PTSD + ANX + DEP 23
I wanted to visualise the table graphically so I thought about generating a Venn diagram. What I expected to see in the diagram is the following.
Expectation list:
However, whilst none of the codes (examples below) generated any technical errors (i.e., R code errors) none of the packages I tried (VennDiagram
and ggVennDiagram
) showed the expected results (see Expectation list
).
Here below are the 4 codes used to generate 4 different Venn diagrams, none of which gave the results outlined in Expectation list
:
Using package VennDiagram
Version 1
pacman::p_load(VennDiagram)
# Move to new plotting page
grid::grid.newpage()
# Calculate percentages
total_samples <- nrow(df)
percentages <- output / total_samples * 100
venn.plot <- VennDiagram::draw.triple.venn(
area1 = output["PTSD"],
area2 = output["ANX"],
area3 = output["DEP"],
n12 = output["PTSD + ANX"],
n23 = output["ANX + DEP"],
n13 = output["PTSD + DEP"],
n123 = output["PTSD + ANX + DEP"],
category = c("PTSD", "ANX", "DEP"),
fill = c("red", "green", "blue"),
lty = "blank",
cex = rep(1.5,7),
cat.cex = rep(1.5,3),
cat.pos = c(-20,-40,-60),
cat.dist = c(0.05,0.05,0.05),
ind = TRUE,
euler.d =TRUE,
)
grid.draw(venn.plot)
Using package VennDiagram
2
pacman::p_load(VennDiagram)
# Move to new plotting page
grid::grid.newpage()
# Use pre-calculated values from 'output'
VennDiagram::draw.triple.venn(
area1 = output["PTSD"],
area2 = output["ANX"],
area3 = output["DEP"],
n12 = output["PTSD + ANX"] + output["PTSD + ANX + DEP"], # Adjust for overlaps
n23 = output["ANX + DEP"] + output["PTSD + ANX + DEP"], # Adjust for overlaps
n13 = output["PTSD + DEP"] + output["PTSD + ANX + DEP"], # Adjust for overlaps
n123 = output["PTSD + ANX + DEP"],
category = c("PTSD", "ANX", "DEP"),
col = "Red", fill = c("Green", "Yellow", "Blue"),
cex = 1.5, cat.cex = 1.5, cat.pos = c(-20, 20, 180)
)
Using package VennDiagram
3
pacman::p_load(VennDiagram)
# Calculate exclusive counts for Venn diagram
ptsd_only <- output["PTSD"] - output["PTSD + ANX"] - output["PTSD + DEP"] + output["PTSD + ANX + DEP"]
anx_only <- output["ANX"] - output["PTSD + ANX"] - output["ANX + DEP"] + output["PTSD + ANX + DEP"]
dep_only <- output["DEP"] - output["PTSD + DEP"] - output["ANX + DEP"] + output["PTSD + ANX + DEP"]
ptsd_anx <- output["PTSD + ANX"] - output["PTSD + ANX + DEP"]
ptsd_dep <- output["PTSD + DEP"] - output["PTSD + ANX + DEP"]
anx_dep <- output["ANX + DEP"] - output["PTSD + ANX + DEP"]
ptsd_anx_dep <- output["PTSD + ANX + DEP"]
# Move to new plotting page
grid::grid.newpage()
# Create Venn diagram with 3 sets using adjusted values
VennDiagram::draw.triple.venn(
area1 = ptsd_only,
area2 = anx_only,
area3 = dep_only,
n12 = ptsd_anx,
n23 = anx_dep,
n13 = ptsd_dep,
n123 = ptsd_anx_dep,
category = c("PTSD", "ANX", "DEP"),
col = "Red", fill = c("Green", "Yellow", "Blue"),
cex = 1.5, cat.cex = 1.5, cat.pos = c(-20, 20, 180)
)
Using package ggVennDiagram
pacman::p_load(ggVennDiagram)
# Prepare data for Venn diagram
venn_data <- list(
PTSD = which(df$PTSD == 2),
ANX = which(df$ANX == 2),
DEP = which(df$DEP == 2)
)
# Create Venn diagram with ggVennDiagram
ggVennDiagram(venn_data) +
ggplot2::scale_fill_gradient(low = "white", high = "darkgrey") +
theme_void()
My question: Short of drawing by hand the diagram, is there a way to generate a Venn diagram with R that reflects the same results as those found in the tibble
table (see figure below)?
Conditions: The code that generates the tibble
table (tibble::enframe(output)
) should remain the same. The Venn diagram should reflect the results of the tibble
table.
Caveat: Perhaps I am missing the point about Venn diagram and what they represent...
Upvotes: 1
Views: 66
Reputation: 19339
For the draw.triple.venn
function, you need to add the individual totals to get the correct sizes of the sets. Try the following:
venn.plot <- VennDiagram::draw.triple.venn(
area1 = output["PTSD"]+output["PTSD + ANX"]+output["PTSD + DEP"]+output["PTSD + ANX + DEP"],
area2 = output["ANX"]+output["PTSD + ANX"]+output["ANX + DEP"]+output["PTSD + ANX + DEP"],
area3 = output["DEP"]+output["ANX + DEP"]+output["PTSD + DEP"]+output["PTSD + ANX + DEP"],
n12 = output["PTSD + ANX"]+output["PTSD + ANX + DEP"],
n23 = output["ANX + DEP"]+output["PTSD + ANX + DEP"],
n13 = output["PTSD + DEP"]+output["PTSD + ANX + DEP"],
n123 = output["PTSD + ANX + DEP"],
category = c("PTSD", "ANX", "DEP"),
fill = c("red", "green", "blue"),
lty = "blank",
#cex = rep(1.5,7),
cat.cex = rep(1.5,3),
#cat.pos = c(-20,-40,-60),
#cat.dist = c(0.05,0.05,0.05),
ind = TRUE,
euler.d =TRUE
)
If you want to start with the raw data, then you can skip the calculations.
# Create a list for each column, and determine `which` rows have values>=1.
df.lst <- lapply(as.list(df), \(x) which(x>=1))
venn.diagram(df.lst, filename = 'Venn_3set_simple.tiff')
This differs from the expected, probably due to the way you calculated the totals and intersections.
Upvotes: 1
Reputation: 3864
Here is a straightforward way to draw a Venn diagram representing when these disorders are High
.
library(dplyr)
library(ggplot2)
library(ComplexUpset)
set.seed(123)
# Generate random data
n <- 490
PTSD <- sample(c(1, 2, NA), n, replace = TRUE) #class(PTSD) = "numeric"
ANX <- sample(c(1, 2, NA), n, replace = TRUE) #class(ANX) = "numeric"
DEP <- sample(c(1, 2, NA), n, replace = TRUE) #class(DEP) = "numeric"
# Create the data frame
disorders <- c('PTSD', 'ANX', 'DEP')
df <- data.frame(PTSD, ANX, DEP)
# Create boolean indicators where "High" == TRUE
df[disorders] = df[disorders] == 2
# Either drop rows with NA
# df = na.omit(df)
# or impute missing values
df[is.na(df)] <- FALSE
glimpse(df)
#> Rows: 490
#> Columns: 3
#> $ PTSD <lgl> FALSE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, FALSE,…
#> $ ANX <lgl> TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
#> $ DEP <lgl> FALSE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE,…
ggplot() +
theme_void() +
coord_fixed() +
geom_venn_circle(df, sets = disorders, size = 1) +
geom_venn_label_set(df, sets = disorders, aes(label = region), outwards_adjust = 2) +
geom_venn_label_region(df, sets = disorders, aes(label = size))
The intersection sizes here don't match your list of expectations because you misdefine them
Intersection of all 3 = 23
Correct
Intersection of ANX and DEP = 51
Intersection of PTSD and DEP = 52
Intersection of PTSD and ANX = 56
Only if you include those 23 where all disorders are high
DEP only = 156
ANX only = 156
PTSD only = 167
None of these match the number of rows where only that disorder is high. Compare the results in the Venn diagrams with a table of counts.
df |>
count(PTSD, ANX, DEP)
#> PTSD ANX DEP n
#> 1 FALSE FALSE FALSE 147
#> 2 FALSE FALSE TRUE 76
#> 3 FALSE TRUE FALSE 72
#> 4 FALSE TRUE TRUE 28
#> 5 TRUE FALSE FALSE 82
#> 6 TRUE FALSE TRUE 29
#> 7 TRUE TRUE FALSE 33
#> 8 TRUE TRUE TRUE 23
Upvotes: 1