Reputation: 473
In this example (https://gist.github.com/thigm85/8424654) LDA was examined vs. PCA on iris dataset. How can I also do LDA on the PCA results (PCA-LDA) ?
Code:
require(MASS)
require(ggplot2)
require(scales)
require(gridExtra)
pca <- prcomp(iris[,-5],
              center = TRUE,
              scale. = TRUE)
prop.pca = pca$sdev^2/sum(pca$sdev^2)
lda <- lda(Species ~ .,
           iris,
           prior = c(1,1,1)/3)
prop.lda = lda$svd^2/sum(lda$svd^2)
plda <- predict(object = lda,
                newdata = iris)
dataset = data.frame(species = iris[,"Species"],
                     pca = pca$x, lda = plda$x)
p1 <- ggplot(dataset) + geom_point(aes(lda.LD1, lda.LD2, colour = species, shape = species), size = 2.5) +
  labs(x = paste("LD1 (", percent(prop.lda[1]), ")", sep=""),
       y = paste("LD2 (", percent(prop.lda[2]), ")", sep=""))
p2 <- ggplot(dataset) + geom_point(aes(pca.PC1, pca.PC2, colour = species, shape = species), size = 2.5) +
  labs(x = paste("PC1 (", percent(prop.pca[1]), ")", sep=""),
       y = paste("PC2 (", percent(prop.pca[2]), ")", sep=""))
grid.arrange(p1, p2)
Upvotes: 1
Views: 2447
Reputation: 707
Here is another way to do PCA-LDA, a.k.a. DAPC, in R, for the case where you need to find the best number of retained principal components for the LDA (as you typically do for large datasets with many predictors).
This solution uses the tidymodels framework for hyperparameter tuning, applied to the Glass dataset.
library(mlbench)
data(Glass)
library(tidymodels)
library(discrim) # with tidymodels wrapper for MASS::lda
mod <- discrim::discrim_linear(mode = "classification", engine = "MASS")
mod %>% translate() # this is what the discrim::discrim_linear wrapper will do
#Linear Discriminant Model Specification (classification)
#
#Computational engine: MASS
#
#Model fit template:
#MASS::lda(formula = missing_arg(), data = missing_arg())
# specify missing_arg() [formula, data] using a tidymodels recipe
rec <- recipe(formula = Type ~ ., data = Glass) %>%
step_normalize(all_numeric_predictors()) %>%
step_pca(all_numeric_predictors(), num_comp = tune())
# tuning grid with hyperparameters in columns
# column name(s) must match tune() above
tuneGrid <- expand.grid(num_comp = 1:ncol(rec$template))
# control tune_grid() process below
trControl <- control_grid(verbose = TRUE, allow_par = FALSE)
wflow <- workflow(preprocessor = rec, spec = mod)
set.seed(8482)
folds <- vfold_cv(rec$template, v = 5, repeats = 1, strata = "Type", pool = 0.1)
# takes a while to process, decrease v and repeats above to speed up
fit_train <- wflow %>%
tune_grid(resamples = folds,
grid = tuneGrid,
metrics = metric_set(accuracy), # or specify multiple metrics
control = trControl)
met_train <- fit_train %>% collect_metrics()
ggplot(met_train, aes(x = num_comp, y = mean)) +
geom_line(color = "#3E4A89FF", size = 2, alpha = 0.6) +
scale_x_continuous(breaks = 1:ncol(rec$template)) +
facet_wrap(~.metric) +
theme_bw()
7 PCs appear to be sufficient for classification:
# show best 5
fit_train %>% show_best(metric = "accuracy")
## A tibble: 5 × 7
# num_comp .metric .estimator mean n std_err .config
# <int> <chr> <chr> <dbl> <int> <dbl> <chr>
#1 9 accuracy multiclass 0.626 5 0.0207 Preprocessor09_Model1
#2 10 accuracy multiclass 0.626 5 0.0207 Preprocessor10_Model1
#3 7 accuracy multiclass 0.626 5 0.0220 Preprocessor07_Model1
#4 8 accuracy multiclass 0.598 5 0.0225 Preprocessor08_Model1
#5 6 accuracy multiclass 0.579 5 0.0221 Preprocessor06_Model1
# select best, e.g. by applying the one-standard-error-rule
(bestTune <- fit_train %>% select_by_one_std_err(num_comp, metric = "accuracy"))
## A tibble: 1 × 9
# num_comp .metric .estimator mean n std_err .config .best .bound
# <int> <chr> <chr> <dbl> <int> <dbl> <chr> <dbl> <dbl>
#1 7 accuracy multiclass 0.626 5 0.0220 Preprocessor07_Model1 0.626 0.605
# finalize workflow
wflow_final <- wflow %>% finalize_workflow(bestTune)
# verify that the workflow was updated correctly
wflow$pre$actions$recipe$recipe$steps[[2]]$num_comp # should be tune()
# tune()
wflow_final$pre$actions$recipe$recipe$steps[[2]]$num_comp # should be 7
#7
fit_final <- wflow_final %>% fit(Glass)
class(fit_final$fit$fit$fit) # here is the MASS::lda object
#lda
# use predict() to get predicted class, posterior probs, and LDA scores
Glass.PCALDA <- tibble(Glass,
predict(fit_final, new_data = Glass, type = "class"), # predicted class
predict(fit_final, new_data = Glass, type = "prob"), # posterior prob. for classes
as_tibble(predict(fit_final, new_data = Glass, type = "raw")$x)) # LD scores
# verify that tidymodels did it right
Own.PCALDA <- lda(prcomp(Glass[,-10], center = T, scale. = T)$x[,1:7],
grouping = Glass[,10])
Own.PCALDA$x <- predict(Own.PCALDA)$x
all.equal(as_tibble(Own.PCALDA$x),
Glass.PCALDA %>% dplyr::select(starts_with("LD"))) # it's the same!
#TRUE
# plot
ggplot(Glass.PCALDA, aes(x = LD1, y = LD2)) +
geom_point(aes(color = Type, shape = .pred_class)) +
theme_bw() +
ggtitle("PCA-LDA (DAPC) on Glass dataset, using 7 PC")
Own.LDA <- lda(scale(Glass[,-10], center = T, scale = T), grouping = Glass[,10])
Own.LDA$predict <- predict(Own.LDA)
Glass.LDA <- tibble(Glass,
.pred_class = Own.LDA$predict$class,
as_tibble(Own.LDA$predict$posterior) %>% rename_all(list(function(x){paste0(".pred_", x)})),
as_tibble(Own.LDA$predict$x))
# plot
ggplot(Glass.LDA, aes(x = LD1, y = LD2)) +
geom_point(aes(color = Type, shape = .pred_class)) +
theme_bw() +
ggtitle("LDA on Glass dataset")
# compare model accuracies
accuracy(Glass.PCALDA, truth = Type, estimate = .pred_class) # 66.8%
accuracy(Glass.LDA, truth = Type, estimate = .pred_class) # 67.3%
For this small dataset, dimensionality reduction via PCA does not lead to better classification results. However, the PCA-LDA procedure allows LDA to be applied to very large datasets with a high degree of multi-collinearity (many highly correlated predictors). The PCA step removes the multi-collinearity (which can be a problem for LDA), and the cross-validation procedure shown above identifies the optimal degree of dimensionality reduction (here, 7 PCs) for classification.
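To see why the PCA step helps with multi-collinearity, one can compare the correlation structure of the original Glass predictors with that of their PC scores; a minimal sketch in base R (not part of the tuning workflow above):
# several pairs of the original predictors are substantially correlated
round(cor(Glass[, -10]), 2)
# the PC scores are uncorrelated by construction,
# so the LDA step no longer sees collinear inputs
pcs <- prcomp(Glass[, -10], center = TRUE, scale. = TRUE)$x
round(cor(pcs), 2)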
Upvotes: 1
Reputation: 707
For PCA-LDA (also called Discriminant Analysis of Principal Components, DAPC), it is important to find the best compromise between underfitting and overfitting the data. This is tricky, especially for high-dimensional data (many variables, i.e. columns). In such cases it becomes a machine-learning problem to be solved with cross-validation, and the only hyperparameter to tune is the number of retained PCs.
The adegenet package offers a handy implementation of PCA-LDA (mainly with genetic data in mind, but it can be used for any type of data). The xvalDapc function fits n.rep LDA models on bootstrap subsets of the data (training.set argument) for all indicated numbers of retained PCs (n.pca argument). Using a specified performance metric (result argument), it finds the n.pca with the lowest associated mean squared error (MSE) across all repetitions.
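For intuition, here is a rough hand-rolled sketch of the idea that xvalDapc automates, using the same iris data as below: repeated random train/test splits, one LDA per candidate number of retained PCs, and the candidate with the best held-out performance wins. The 2/3 split, 50 repeats, and plain accuracy metric are arbitrary simplifications (and the sketch skips the by-group stratification that xvalDapc performs), so treat it as illustration only:
library(MASS)
set.seed(1)
X  <- scale(iris[, 1:4])          # centred and scaled predictors
y  <- iris[, 5]
pc <- prcomp(X)$x                 # PC scores
acc <- sapply(1:3, function(k) {  # candidate numbers of retained PCs
  mean(replicate(50, {
    train <- sample(nrow(X), size = round(2/3 * nrow(X)))
    fit   <- lda(pc[train, 1:k, drop = FALSE], grouping = y[train])
    pred  <- predict(fit, pc[-train, 1:k, drop = FALSE])$class
    mean(pred == y[-train])       # held-out accuracy for this repeat
  }))
})
acc                               # mean held-out accuracy for n.pca = 1, 2, 3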
Tune n.pca for DAPC
library(adegenet)
# center and scale data to unit variance
iris.s <- scale(iris[,1:4])
# tune PCA-LDA / DAPC hyperparameter (n.pca)
d.dapc <- xvalDapc(x = iris.s, grp = iris[,5],
scale = FALSE, center = TRUE, result = "groupMean",
training.set = 2/3, n.pca = 1:3, n.rep = 100)
(n.pca <- as.numeric(d.dapc$`Number of PCs Achieving Lowest MSE`))
Note that scaling makes sense here since petals and sepals are much longer than they are wide. An analysis on unscaled data would result in PCs heavily loaded by petal and sepal length (due to their larger variance), even though petal width is an important distinguishing feature (see the plot below).
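A quick way to check that claim is to compare the PC1 loadings of an unscaled and a scaled PCA of the iris measurements (a small sketch using the rotation matrix returned by prcomp):
# PC1 loadings on the raw measurements: dominated by petal length,
# by far the highest-variance variable
round(prcomp(iris[, 1:4], center = TRUE, scale. = FALSE)$rotation[, 1], 2)
# PC1 loadings after scaling to unit variance: petal width now
# contributes on a comparable footing
round(prcomp(iris[, 1:4], center = TRUE, scale. = TRUE)$rotation[, 1], 2)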
The final DAPC model, d.dapc$DAPC, is fit on the complete training data using the optimal n.pca, which was found to be 3. Once the optimal n.pca has been identified via cross-validation, a quick way to re-fit the model with some additional results (loadings of the original variables, $var.load) is the dapc function (used internally by xvalDapc):
dd.dapc <- dapc(x = iris.s, grp = iris[,5], n.pca = n.pca, n.da = 2,
scale = FALSE, center = TRUE, var.loadings = TRUE)
The dapc object class contains various slots, of which the most important are:
- the LD scores of the individuals ($ind.coord)
- the loadings of the original variables ($var.load)
- the variable contributions ($var.contr)
- the eigenvalues of the discriminant functions ($eig)
Compute % explained variance
> (expl_var <- dd.dapc$eig / sum(dd.dapc$eig))
[1] 0.991500694 0.008499306
Compare dapc and prcomp / lda
iris.pca <- prcomp(iris.s, center = TRUE, scale. = FALSE)
iris.lda <- lda(x = iris.pca$x[,1:n.pca], grouping = iris[,5])
scores.own <- predict(iris.lda)$x
# check conformity
scores.dapc <- dd.dapc$ind.coord
all.equal(unname(scores.own), unname(scores.dapc)) # it's the same!
[1] TRUE
Visualization
par(mfrow=c(2,1))
scatter(dd.dapc)
loadingplot(dd.dapc$var.contr, axis = 1, main = "Loadings on LD1")
par(mfrow=c(1,1))
Upvotes: 0
Reputation: 46888
Usually you do PCA-LDA to reduce the dimensionality of your data before performing the LDA. Ideally, you decide on the first k components to keep from the PCA. In your example with iris, we take the first 2 components; otherwise the result will look pretty much the same as without PCA.
Try it like this:
pcdata = data.frame(pca$x[,1:2], Species = iris$Species)
pc_lda <- lda(Species ~ ., data = pcdata, prior = c(1,1,1)/3)
prop_pc_lda = pc_lda$svd^2/sum(pc_lda$svd^2)
pc_plda <- predict(object = pc_lda, newdata = pcdata)
dataset = data.frame(species = iris[,"Species"], pc_plda$x)
p3 <- ggplot(dataset) + geom_point(aes(LD1, LD2, colour = species, shape = species), size = 2.5) +
  labs(x = paste("LD1 (", percent(prop_pc_lda[1]), ")", sep=""),
       y = paste("LD2 (", percent(prop_pc_lda[2]), ")", sep=""))
print(p3)
You don't see much of a difference here because the first 2 components of the PCA capture most of the variance in the iris dataset.
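You can verify that directly from the pca object built in the question's code (a quick check):
# cumulative proportion of variance explained by the PCs;
# PC1 + PC2 already cover roughly 96% of the variance of the scaled iris measurements
summary(pca)
cumsum(prop.pca)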
Upvotes: 1
Reputation: 76402
This is very simple: apply lda to the principal component scores returned by prcomp in the question's code.
pca_lda <- lda(pca$x, grouping = iris$Species)
Now it is a matter of using the predict methods for each object type to get the classification accuracies.
pred_pca_lda <- predict(pca_lda, predict(pca, iris))
accuracy_lda <- mean(plda$class == iris$Species)
accuracy_pca_lda <- mean(pred_pca_lda$class == iris$Species)
accuracy_lda
#[1] 0.98
accuracy_pca_lda
#[1] 0.98
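These are resubstitution accuracies on the full training data. For a more detailed view of where the two fits agree or disagree, a per-class confusion matrix is a small extra step (plda comes from the question's code, pred_pca_lda from above):
# rows = predicted class, columns = true species
table(predicted = plda$class, truth = iris$Species)
table(predicted = pred_pca_lda$class, truth = iris$Species)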
Upvotes: 0