Reputation: 113
I have a dataset and want to draw a PCA plot of the individuals. In this plot, the observations should be coloured by group, based on the `name` column (`habillage = a$name`). Additionally, I want each observation to show which group it belongs to in terms of `Age`. I found that `label = "none"` does not show any labels, but if I write `label = a$Age` nothing changes. Finally, how can I avoid showing in the legend the black/white text that duplicates `habillage = a$name`?
# Attach the packages used below. plyr is attached before dplyr so that
# dplyr's verbs (summarise(), arrange(), ...) are the ones found first.
library(plyr)
library(dplyr)
library(factoextra)
# Example data (looks like dput() output): a grouped tibble with 40 rows and
# 11 columns, grouped by effective_status and Age (7 groups).
# NOTE(review): the columns match what the summarise() pipeline that builds
# `a` would return, so this is presumably dput(a) -- confirm with the author,
# since the code below refers to `a` and `helmes` rather than `df`.
df<-structure(list(effective_status = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("ACTIVE", "PAUSED"), class = "factor"),
Age = structure(c(3L, 8L, 6L, 4L, 4L, 5L, 4L, 2L, 4L, 8L,
2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L
), .Label = c("13-17", "18-24", "25-34", "35-44", "45-54",
"55-64", "65+", "Unknown"), class = "factor"), name = structure(c(19L,
23L, 18L, 22L, 9L, 6L, 6L, 9L, 15L, 14L, 12L, 14L, 12L, 13L,
15L, 10L, 11L, 20L, 9L, 13L, 19L, 6L, 9L, 10L, 13L, 14L,
19L, 20L, 21L, 22L, 6L, 10L, 11L, 13L, 14L, 18L, 23L, 12L,
21L, 22L), .Label = c("Automated Boost", "Competitors January",
"Dynamic Ad", "Focus campaign", "Marketing 0-25", "Marketing April",
"Marketing August", "Marketing December", "Marketing February",
"Marketing January", "Marketing July", "Marketing June",
"Marketing March", "Marketing May", "Upsell April", "Upsell August",
"Upsell Boost", "Upsell February", "Upsell January", "Upsell July",
"Upsell June", "Upsell March", "Upsell May"), class = "factor"),
n_obs = c(1L, 1L, 1L, 3L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L), Clicks = c(1364L,
0L, 4919L, 2597L, 2641L, 0L, 915L, 1104L, 63L, 0L, 242L,
206L, 3661L, 11L, 33L, 0L, 246L, 247L, 4L, 0L, 0L, 0L, 0L,0L, 0L, 2009L, 0L, 43L, 166L, 0L, 17L, 0L, 95L, 0L, 137L,
0L, 68L, 0L, 0L, 0L), Impressions = c(12409L, 0L, 58222L,
30115L, 47119L, 0L, 18817L, 17068L, 4175L, 0L, 4528L, 9842L,
98421L, 3L, 6042L, 0L, 7154L, 4253L, 202L, 0L, 0L, 0L, 0L,
150L, 0L, 17117L, 0L, 857L, 1821L, 0L, 1034L, 0L, 1258L,
0L, 948L, 0L, 2972L, 0L, 0L, 0L), Reach = c(12164L, 0L, 46142L,
25282L, 35142L, 0L, 14843L, 13533L, 3624L, 0L, 4528L, 8394L,
58401L, 3L, 5874L, 0L, 7013L, 3586L, 202L, 0L, 0L, 0L, 0L,
150L, 0L, 15349L, 0L, 819L, 1810L, 0L, 1014L, 0L, 938L, 0L,
948L, 0L, 2782L, 0L, 0L, 0L), Spend = c(1153.11, 0, 9663.16,
3202.1, 3393.49, 0, 1739.37, 1344.19, 501.88, 0, 299.22,
565.74, 11228.5, 0.15, 609.05, 0, 709.19, 478.98, 26.12,
0, 0, 0, 0, 22.25, 0, 2485.04, 0, 232.14, 256.1, 0, 129.6,
0, 157.25, 0, 122.62, 0, 717.32, 0, 0, 0.05), Purchase = c(140L,
163L, 104L, 33L, 22L, 17L, 11L, 13L, 2L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), PurchaseValue = c(221595.22,
173029.62, 101894.91, 38974.63, 27336.71, 13247.8, 12461.66,
6186.55, 3754.31, 971.11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Date_minus_start_time = c(9, 13, 15, 26.3055555555556, 29,
5.5, 5.5, 19, 17, 16.5, 2, 27, 10, 0, 29, 26.5, 13, 15, 19,
43.9583333333333, 30, 5, 28, 8, 29.9583333333333, 21, 19,
3, 9, 17.5, 28, 10, 14, 30.4791666666667, 0, 11, 15, 18,
21, 5)), row.names = c(NA, -40L), groups = structure(list(
effective_status = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("ACTIVE", "PAUSED"), class = "factor"), Age = structure(2:8, .Label = c("13-17",
"18-24", "25-34", "35-44", "45-54", "55-64", "65+", "Unknown"
), class = "factor"), .rows = structure(list(c(8L, 11L, 12L
), c(1L, 13L, 14L, 15L), c(4L, 5L, 7L, 9L, 16L, 17L, 18L),
c(6L, 19L, 20L, 21L), c(3L, 22L, 23L, 24L, 25L, 26L,
27L, 28L, 29L, 30L), 31:37, c(2L, 10L, 38L, 39L, 40L)), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -7L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
# Build the summary table `a`:
#   1. draw 100 random rows from `helmes`,
#   2. drop the three excluded campaign names,
#   3. aggregate the metrics per effective_status / Age / name,
#   4. sort by total purchase value, largest first.
a <- helmes[sample(nrow(helmes), 100), ] %>%
  subset(!(name %in% c("Upsell Boost", "Marketing 0-25", "Dynamic Ad"))) %>%
  group_by(effective_status, Age, name) %>%
  summarise(
    n_obs = n(),
    Clicks = sum(Clicks, na.rm = TRUE),
    Impressions = sum(Impressions, na.rm = TRUE),
    Reach = sum(Reach, na.rm = TRUE),
    Spend = sum(Spend, na.rm = TRUE),
    Purchase = sum(Purchase, na.rm = TRUE),
    PurchaseValue = sum(PurchaseValue, na.rm = TRUE),
    Date_minus_start_time = mean(Date_minus_start_time, na.rm = TRUE)
  ) %>%
  arrange(desc(PurchaseValue))
# PCA on the summary columns only: the first three columns of `a`
# (effective_status, Age, name) are the grouping factors, so the analysis
# starts at column 4 (n_obs). `scale.` is written out in full instead of
# relying on partial matching of `scale`.
res.pca <- prcomp(a[4:ncol(a)], scale. = TRUE)

# Plot of the individuals, coloured by campaign name, with confidence
# ellipses per group. Labels are switched off here.
fviz_pca_ind(
  res.pca,
  # col.ind = a$name,                 # alternative: colour by groups
  label = "none",
  # geom = c("point", "text"),
  habillage = a$name,                 # colour by groups
  # palette = c("#00AFBB", "#FC4E07", "#2CA25F"),
  addEllipses = TRUE,                 # concentration ellipses
  ellipse.type = "confidence",
  legend.title = "Groups",
  repel = TRUE
)
Upvotes: 1
Views: 1017
Reputation: 10627
You can extract the computed PCA scores and then do your own ggplot:
library(tidyverse)
library(factoextra)
#> Welcome! Want to learn more? See two factoextra-related books at
# Example data (looks like dput() output): a grouped tibble with 40 rows and
# 11 columns, grouped by effective_status and Age (7 groups). The three factor
# columns are effective_status, Age and name; the remaining columns are
# numeric measures.
df <- structure(list(
effective_status = structure(c(
1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L
), .Label = c("ACTIVE", "PAUSED"), class = "factor"),
Age = structure(c(
3L, 8L, 6L, 4L, 4L, 5L, 4L, 2L, 4L, 8L,
2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L
), .Label = c(
"13-17", "18-24", "25-34", "35-44", "45-54",
"55-64", "65+", "Unknown"
), class = "factor"), name = structure(c(
19L,
23L, 18L, 22L, 9L, 6L, 6L, 9L, 15L, 14L, 12L, 14L, 12L, 13L,
15L, 10L, 11L, 20L, 9L, 13L, 19L, 6L, 9L, 10L, 13L, 14L,
19L, 20L, 21L, 22L, 6L, 10L, 11L, 13L, 14L, 18L, 23L, 12L,
21L, 22L
), .Label = c(
"Automated Boost", "Competitors January",
"Dynamic Ad", "Focus campaign", "Marketing 0-25", "Marketing April",
"Marketing August", "Marketing December", "Marketing February",
"Marketing January", "Marketing July", "Marketing June",
"Marketing March", "Marketing May", "Upsell April", "Upsell August",
"Upsell Boost", "Upsell February", "Upsell January", "Upsell July",
"Upsell June", "Upsell March", "Upsell May"
), class = "factor"),
n_obs = c(
1L, 1L, 1L, 3L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L
), Clicks = c(
1364L,
0L, 4919L, 2597L, 2641L, 0L, 915L, 1104L, 63L, 0L, 242L,
206L, 3661L, 11L, 33L, 0L, 246L, 247L, 4L, 0L, 0L, 0L, 0L, 0L, 0L, 2009L, 0L, 43L, 166L, 0L, 17L, 0L, 95L, 0L, 137L,
0L, 68L, 0L, 0L, 0L
), Impressions = c(
12409L, 0L, 58222L,
30115L, 47119L, 0L, 18817L, 17068L, 4175L, 0L, 4528L, 9842L,
98421L, 3L, 6042L, 0L, 7154L, 4253L, 202L, 0L, 0L, 0L, 0L,
150L, 0L, 17117L, 0L, 857L, 1821L, 0L, 1034L, 0L, 1258L,
0L, 948L, 0L, 2972L, 0L, 0L, 0L
), Reach = c(
12164L, 0L, 46142L,
25282L, 35142L, 0L, 14843L, 13533L, 3624L, 0L, 4528L, 8394L,
58401L, 3L, 5874L, 0L, 7013L, 3586L, 202L, 0L, 0L, 0L, 0L,
150L, 0L, 15349L, 0L, 819L, 1810L, 0L, 1014L, 0L, 938L, 0L,
948L, 0L, 2782L, 0L, 0L, 0L
), Spend = c(
1153.11, 0, 9663.16,
3202.1, 3393.49, 0, 1739.37, 1344.19, 501.88, 0, 299.22,
565.74, 11228.5, 0.15, 609.05, 0, 709.19, 478.98, 26.12,
0, 0, 0, 0, 22.25, 0, 2485.04, 0, 232.14, 256.1, 0, 129.6,
0, 157.25, 0, 122.62, 0, 717.32, 0, 0, 0.05
), Purchase = c(
140L,
163L, 104L, 33L, 22L, 17L, 11L, 13L, 2L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L
), PurchaseValue = c(
221595.22,
173029.62, 101894.91, 38974.63, 27336.71, 13247.8, 12461.66,
6186.55, 3754.31, 971.11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
),
Date_minus_start_time = c(
9, 13, 15, 26.3055555555556, 29,
5.5, 5.5, 19, 17, 16.5, 2, 27, 10, 0, 29, 26.5, 13, 15, 19,
43.9583333333333, 30, 5, 28, 8, 29.9583333333333, 21, 19,
3, 9, 17.5, 28, 10, 14, 30.4791666666667, 0, 11, 15, 18,
21, 5
)
), row.names = c(NA, -40L), groups = structure(list(
effective_status = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("ACTIVE", "PAUSED"), class = "factor"), Age = structure(2:8, .Label = c(
"13-17",
"18-24", "25-34", "35-44", "45-54", "55-64", "65+", "Unknown"
), class = "factor"), .rows = structure(list(
c(8L, 11L, 12L), c(1L, 13L, 14L, 15L), c(4L, 5L, 7L, 9L, 16L, 17L, 18L),
c(6L, 19L, 20L, 21L), c(
3L, 22L, 23L, 24L, 25L, 26L,
27L, 28L, 29L, 30L
), 31:37, c(2L, 10L, 38L, 39L, 40L)
), ptype = integer(0), class = c(
"vctrs_list_of",
"vctrs_vctr", "list"
))
), row.names = c(NA, -7L), class = c(
"tbl_df",
"tbl", "data.frame"
), .drop = TRUE), class = c(
"grouped_df",
"tbl_df", "tbl", "data.frame"
))
# Show the tibble to confirm its structure before running the PCA.
print(df)
#> # A tibble: 40 x 11
#> # Groups: effective_status, Age [7]
#> effective_status Age name n_obs Clicks Impressions Reach Spend Purchase
#> <fct> <fct> <fct> <int> <int> <int> <int> <dbl> <int>
#> 1 ACTIVE 25-34 Upsell… 1 1364 12409 12164 1153. 140
#> 2 ACTIVE Unkno… Upsell… 1 0 0 0 0 163
#> 3 ACTIVE 55-64 Upsell… 1 4919 58222 46142 9663. 104
#> 4 ACTIVE 35-44 Upsell… 3 2597 30115 25282 3202. 33
#> 5 ACTIVE 35-44 Market… 2 2641 47119 35142 3393. 22
#> 6 ACTIVE 45-54 Market… 2 0 0 0 0 17
#> 7 ACTIVE 35-44 Market… 2 915 18817 14843 1739. 11
#> 8 ACTIVE 18-24 Market… 1 1104 17068 13533 1344. 13
#> 9 ACTIVE 35-44 Upsell… 1 63 4175 3624 502. 2
#> 10 ACTIVE Unkno… Market… 2 0 0 0 0 0
#> # … with 30 more rows, and 2 more variables: PurchaseValue <dbl>,
#> # Date_minus_start_time <dbl>
# PCA on the seven numeric measures (n_obs and the factor columns are left
# out). `scale.` is written out in full instead of relying on partial matching
# of `scale`.
res.pca <- prcomp(
  df[, c(
    "Clicks", "Impressions", "Reach", "Spend",
    "Purchase", "PurchaseValue", "Date_minus_start_time"
  )],
  scale. = TRUE
)

# fviz_pca_ind() is used only to obtain the plotted coordinates of the
# individuals (columns x and y of its $data). Only those two columns are kept
# before binding to `df`: the fviz data has its own `name` column, which would
# otherwise clash with df$name and make bind_cols() rename both columns by
# position (name...1 / name...9).
fviz_pca_ind(res.pca)$data %>%
  select(x, y) %>%
  bind_cols(df) %>%
  ggplot(aes(x, y, color = name)) +
  geom_label(aes(label = Age)) +   # each observation is labelled with its Age
  labs(color = "Name")
Created on 2021-09-17 by the reprex package (v2.0.1)
Upvotes: 1