John Mayer
John Mayer

Reputation: 113

How to add labels to a PCA plot from my data frame

I have a dataset and want to draw a PCA plot. In this plot the observations should be coloured by group, based on the name column (habillage = a$name). Additionally, I want each individual observation to be labelled with the Age group it belongs to. With label = "none" no labels are shown, but if I write label = a$Age nothing changes. Finally, how can I avoid showing the black/white text in the legend, which duplicates habillage = a$name?

# Packages used by the script. plyr is attached before dplyr so that dplyr's
# verbs of the same name (e.g. summarise, arrange) mask plyr's, not the reverse.
library(plyr) # fixed typo: was `libary(plyr)`, which is not a function
library(dplyr)
library(factoextra)
# Example data: a 40-row, 11-column grouped tibble (class "grouped_df"),
# apparently pasted from dput() output.
#   - effective_status, Age and name are factors (every row here has
#     effective_status level 1, "ACTIVE").
#   - n_obs, Clicks, Impressions, Reach and Purchase are integer vectors.
#   - Spend, PurchaseValue and Date_minus_start_time are double vectors.
# The `groups` attribute records grouping by effective_status and Age
# (7 groups); `.rows` holds the row indices belonging to each group.
df<-structure(list(effective_status = structure(c(1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L), .Label = c("ACTIVE", "PAUSED"), class = "factor"), 
    Age = structure(c(3L, 8L, 6L, 4L, 4L, 5L, 4L, 2L, 4L, 8L, 
    2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 
    6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L
    ), .Label = c("13-17", "18-24", "25-34", "35-44", "45-54", 
    "55-64", "65+", "Unknown"), class = "factor"), name = structure(c(19L, 
    23L, 18L, 22L, 9L, 6L, 6L, 9L, 15L, 14L, 12L, 14L, 12L, 13L, 
    15L, 10L, 11L, 20L, 9L, 13L, 19L, 6L, 9L, 10L, 13L, 14L, 
    19L, 20L, 21L, 22L, 6L, 10L, 11L, 13L, 14L, 18L, 23L, 12L, 
    21L, 22L), .Label = c("Automated Boost", "Competitors January", 
    "Dynamic Ad", "Focus campaign", "Marketing 0-25", "Marketing April", 
    "Marketing August", "Marketing December", "Marketing February", 
    "Marketing January", "Marketing July", "Marketing June", 
    "Marketing March", "Marketing May", "Upsell April", "Upsell August", 
    "Upsell Boost", "Upsell February", "Upsell January", "Upsell July", 
    "Upsell June", "Upsell March", "Upsell May"), class = "factor"), 
    n_obs = c(1L, 1L, 1L, 3L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 
    1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
    1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L), Clicks = c(1364L, 
    0L, 4919L, 2597L, 2641L, 0L, 915L, 1104L, 63L, 0L, 242L, 
    206L, 3661L, 11L, 33L, 0L, 246L, 247L, 4L, 0L, 0L, 0L, 0L,0L, 0L, 2009L, 0L, 43L, 166L, 0L, 17L, 0L, 95L, 0L, 137L, 
    0L, 68L, 0L, 0L, 0L), Impressions = c(12409L, 0L, 58222L, 
    30115L, 47119L, 0L, 18817L, 17068L, 4175L, 0L, 4528L, 9842L, 
    98421L, 3L, 6042L, 0L, 7154L, 4253L, 202L, 0L, 0L, 0L, 0L, 
    150L, 0L, 17117L, 0L, 857L, 1821L, 0L, 1034L, 0L, 1258L, 
    0L, 948L, 0L, 2972L, 0L, 0L, 0L), Reach = c(12164L, 0L, 46142L, 
    25282L, 35142L, 0L, 14843L, 13533L, 3624L, 0L, 4528L, 8394L, 
    58401L, 3L, 5874L, 0L, 7013L, 3586L, 202L, 0L, 0L, 0L, 0L, 
    150L, 0L, 15349L, 0L, 819L, 1810L, 0L, 1014L, 0L, 938L, 0L, 
    948L, 0L, 2782L, 0L, 0L, 0L), Spend = c(1153.11, 0, 9663.16, 
    3202.1, 3393.49, 0, 1739.37, 1344.19, 501.88, 0, 299.22, 
    565.74, 11228.5, 0.15, 609.05, 0, 709.19, 478.98, 26.12, 
    0, 0, 0, 0, 22.25, 0, 2485.04, 0, 232.14, 256.1, 0, 129.6, 
    0, 157.25, 0, 122.62, 0, 717.32, 0, 0, 0.05), Purchase = c(140L, 
    163L, 104L, 33L, 22L, 17L, 11L, 13L, 2L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), PurchaseValue = c(221595.22, 
    173029.62, 101894.91, 38974.63, 27336.71, 13247.8, 12461.66, 
    6186.55, 3754.31, 971.11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 
    Date_minus_start_time = c(9, 13, 15, 26.3055555555556, 29, 
    5.5, 5.5, 19, 17, 16.5, 2, 27, 10, 0, 29, 26.5, 13, 15, 19, 
    43.9583333333333, 30, 5, 28, 8, 29.9583333333333, 21, 19, 
    3, 9, 17.5, 28, 10, 14, 30.4791666666667, 0, 11, 15, 18, 
    21, 5)), row.names = c(NA, -40L), groups = structure(list(
    effective_status = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L
    ), .Label = c("ACTIVE", "PAUSED"), class = "factor"), Age = structure(2:8, .Label = c("13-17", 
"18-24", "25-34", "35-44", "45-54", "55-64", "65+", "Unknown"
    ), class = "factor"), .rows = structure(list(c(8L, 11L, 12L
    ), c(1L, 13L, 14L, 15L), c(4L, 5L, 7L, 9L, 16L, 17L, 18L), 
        c(6L, 19L, 20L, 21L), c(3L, 22L, 23L, 24L, 25L, 26L, 
        27L, 28L, 29L, 30L), 31:37, c(2L, 10L, 38L, 39L, 40L)), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -7L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))


# Build the per-campaign summary table `a` that the PCA below is run on:
#   1. draw 100 random rows from `helmes` (no seed is set in the visible code,
#      so the sample differs from run to run),
#   2. drop three campaigns by name,
#   3. aggregate per (effective_status, Age, name): row count, summed metrics
#      and mean Date_minus_start_time, ignoring NAs,
#   4. sort by total PurchaseValue, largest first.
# NOTE(review): `helmes` is not defined in this snippet -- presumably the raw,
# un-aggregated data frame; confirm.
a <- helmes[sample(nrow(helmes), 100), ] %>%
  subset(!(name %in% c("Upsell Boost", "Marketing 0-25", "Dynamic Ad"))) %>%
  group_by(effective_status, Age, name) %>%
  summarise(
    n_obs = n(),
    Clicks = sum(Clicks, na.rm = TRUE),
    Impressions = sum(Impressions, na.rm = TRUE),
    Reach = sum(Reach, na.rm = TRUE),
    Spend = sum(Spend, na.rm = TRUE),
    Purchase = sum(Purchase, na.rm = TRUE),
    PurchaseValue = sum(PurchaseValue, na.rm = TRUE),
    Date_minus_start_time = mean(Date_minus_start_time, na.rm = TRUE),
    # Return an ungrouped tibble: without this the result silently stays
    # grouped by effective_status and Age, which nothing downstream needs.
    .groups = "drop"
  ) %>%
  arrange(desc(PurchaseValue))


# PCA on every column of `a` from the 4th onward (n_obs plus the aggregated
# metrics), with each variable scaled to unit variance first.
# `scale.` is the formal argument name of prcomp(); the previous `scale = TRUE`
# only worked through partial argument matching.
res.pca <- prcomp(a[4:ncol(a)], scale. = TRUE)

# Plot the PCA individuals, coloured by campaign name, with a confidence
# ellipse per group. Labels are switched off; the commented-out arguments are
# kept as disabled alternatives.
fviz_pca_ind(
  res.pca,
  habillage = a$name, # colour by groups
  # col.ind = a$name, # colour by groups
  label = "none",
  # geom = c("point", "text"),
  # palette = c("#00AFBB", "#FC4E07", "#2CA25F"),
  addEllipses = TRUE, # concentration ellipses
  ellipse.type = "confidence",
  legend.title = "Groups",
  repel = TRUE
)

Upvotes: 1

Views: 1017

Answers (1)

danlooo
danlooo

Reputation: 10627

You can extract the computed PCA scores and then do your own ggplot:

library(tidyverse)
library(factoextra)
#> Welcome! Want to learn more? See two factoextra-related books at

# Example data copied from the question: a 40-row, 11-column grouped tibble
# (class "grouped_df").
#   - effective_status, Age and name are factors.
#   - n_obs, Clicks, Impressions, Reach and Purchase are integer vectors.
#   - Spend, PurchaseValue and Date_minus_start_time are double vectors.
# The `groups` attribute records grouping by effective_status and Age
# (7 groups); `.rows` holds the row indices belonging to each group.
df <- structure(list(
  effective_status = structure(c(
    1L, 1L, 1L, 1L,
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
    1L, 1L, 1L, 1L
  ), .Label = c("ACTIVE", "PAUSED"), class = "factor"),
  Age = structure(c(
    3L, 8L, 6L, 4L, 4L, 5L, 4L, 2L, 4L, 8L,
    2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L,
    6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L
  ), .Label = c(
    "13-17", "18-24", "25-34", "35-44", "45-54",
    "55-64", "65+", "Unknown"
  ), class = "factor"), name = structure(c(
    19L,
    23L, 18L, 22L, 9L, 6L, 6L, 9L, 15L, 14L, 12L, 14L, 12L, 13L,
    15L, 10L, 11L, 20L, 9L, 13L, 19L, 6L, 9L, 10L, 13L, 14L,
    19L, 20L, 21L, 22L, 6L, 10L, 11L, 13L, 14L, 18L, 23L, 12L,
    21L, 22L
  ), .Label = c(
    "Automated Boost", "Competitors January",
    "Dynamic Ad", "Focus campaign", "Marketing 0-25", "Marketing April",
    "Marketing August", "Marketing December", "Marketing February",
    "Marketing January", "Marketing July", "Marketing June",
    "Marketing March", "Marketing May", "Upsell April", "Upsell August",
    "Upsell Boost", "Upsell February", "Upsell January", "Upsell July",
    "Upsell June", "Upsell March", "Upsell May"
  ), class = "factor"),
  n_obs = c(
    1L, 1L, 1L, 3L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L,
    1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
    1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L
  ), Clicks = c(
    1364L,
    0L, 4919L, 2597L, 2641L, 0L, 915L, 1104L, 63L, 0L, 242L,
    206L, 3661L, 11L, 33L, 0L, 246L, 247L, 4L, 0L, 0L, 0L, 0L, 0L, 0L, 2009L, 0L, 43L, 166L, 0L, 17L, 0L, 95L, 0L, 137L,
    0L, 68L, 0L, 0L, 0L
  ), Impressions = c(
    12409L, 0L, 58222L,
    30115L, 47119L, 0L, 18817L, 17068L, 4175L, 0L, 4528L, 9842L,
    98421L, 3L, 6042L, 0L, 7154L, 4253L, 202L, 0L, 0L, 0L, 0L,
    150L, 0L, 17117L, 0L, 857L, 1821L, 0L, 1034L, 0L, 1258L,
    0L, 948L, 0L, 2972L, 0L, 0L, 0L
  ), Reach = c(
    12164L, 0L, 46142L,
    25282L, 35142L, 0L, 14843L, 13533L, 3624L, 0L, 4528L, 8394L,
    58401L, 3L, 5874L, 0L, 7013L, 3586L, 202L, 0L, 0L, 0L, 0L,
    150L, 0L, 15349L, 0L, 819L, 1810L, 0L, 1014L, 0L, 938L, 0L,
    948L, 0L, 2782L, 0L, 0L, 0L
  ), Spend = c(
    1153.11, 0, 9663.16,
    3202.1, 3393.49, 0, 1739.37, 1344.19, 501.88, 0, 299.22,
    565.74, 11228.5, 0.15, 609.05, 0, 709.19, 478.98, 26.12,
    0, 0, 0, 0, 22.25, 0, 2485.04, 0, 232.14, 256.1, 0, 129.6,
    0, 157.25, 0, 122.62, 0, 717.32, 0, 0, 0.05
  ), Purchase = c(
    140L,
    163L, 104L, 33L, 22L, 17L, 11L, 13L, 2L, 0L, 0L, 0L, 0L,
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L
  ), PurchaseValue = c(
    221595.22,
    173029.62, 101894.91, 38974.63, 27336.71, 13247.8, 12461.66,
    6186.55, 3754.31, 971.11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  ),
  Date_minus_start_time = c(
    9, 13, 15, 26.3055555555556, 29,
    5.5, 5.5, 19, 17, 16.5, 2, 27, 10, 0, 29, 26.5, 13, 15, 19,
    43.9583333333333, 30, 5, 28, 8, 29.9583333333333, 21, 19,
    3, 9, 17.5, 28, 10, 14, 30.4791666666667, 0, 11, 15, 18,
    21, 5
  )
), row.names = c(NA, -40L), groups = structure(list(
  effective_status = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("ACTIVE", "PAUSED"), class = "factor"), Age = structure(2:8, .Label = c(
    "13-17",
    "18-24", "25-34", "35-44", "45-54", "55-64", "65+", "Unknown"
  ), class = "factor"), .rows = structure(list(
    c(8L, 11L, 12L), c(1L, 13L, 14L, 15L), c(4L, 5L, 7L, 9L, 16L, 17L, 18L),
    c(6L, 19L, 20L, 21L), c(
      3L, 22L, 23L, 24L, 25L, 26L,
      27L, 28L, 29L, 30L
    ), 31:37, c(2L, 10L, 38L, 39L, 40L)
  ), ptype = integer(0), class = c(
    "vctrs_list_of",
    "vctrs_vctr", "list"
  ))
), row.names = c(NA, -7L), class = c(
  "tbl_df",
  "tbl", "data.frame"
), .drop = TRUE), class = c(
  "grouped_df",
  "tbl_df", "tbl", "data.frame"
))

df
#> # A tibble: 40 x 11
#> # Groups:   effective_status, Age [7]
#>    effective_status Age    name    n_obs Clicks Impressions Reach Spend Purchase
#>    <fct>            <fct>  <fct>   <int>  <int>       <int> <int> <dbl>    <int>
#>  1 ACTIVE           25-34  Upsell…     1   1364       12409 12164 1153.      140
#>  2 ACTIVE           Unkno… Upsell…     1      0           0     0    0       163
#>  3 ACTIVE           55-64  Upsell…     1   4919       58222 46142 9663.      104
#>  4 ACTIVE           35-44  Upsell…     3   2597       30115 25282 3202.       33
#>  5 ACTIVE           35-44  Market…     2   2641       47119 35142 3393.       22
#>  6 ACTIVE           45-54  Market…     2      0           0     0    0        17
#>  7 ACTIVE           35-44  Market…     2    915       18817 14843 1739.       11
#>  8 ACTIVE           18-24  Market…     1   1104       17068 13533 1344.       13
#>  9 ACTIVE           35-44  Upsell…     1     63        4175  3624  502.        2
#> 10 ACTIVE           Unkno… Market…     2      0           0     0    0         0
#> # … with 30 more rows, and 2 more variables: PurchaseValue <dbl>,
#> #   Date_minus_start_time <dbl>

# Run the PCA on the seven metric columns only (the factor columns and n_obs
# are left out), scaling each variable to unit variance first.
# `scale.` is the formal argument name of prcomp(); the previous `scale = TRUE`
# only worked through partial argument matching.
res.pca <- prcomp(
  df[, c(
    "Clicks", "Impressions", "Reach", "Spend", "Purchase", "PurchaseValue",
    "Date_minus_start_time"
  )],
  scale. = TRUE
)

# Take the per-individual plot data that factoextra computes, attach the
# original columns of `df` row by row, and draw a custom ggplot in which every
# observation is a label showing its Age, coloured by campaign name.
fviz_pca_ind(res.pca)$data %>%
  # Both tables carry a `name` column. Renaming the factoextra one before
  # binding stops bind_cols() from repairing the duplicates into
  # position-dependent names (`name...1`, `name...9`), which would break as
  # soon as a column is added to or removed from either table.
  rename(ind = name) %>%
  bind_cols(df) %>%
  ggplot(aes(x, y, color = name)) +
  geom_label(aes(label = Age)) +
  labs(color = "Name")

Created on 2021-09-17 by the reprex package (v2.0.1)

Upvotes: 1

Related Questions