Dimension Reduction

PCA

library(FactoMineR)
data("decathlon")
summary(decathlon)
##       100m         Long.jump       Shot.put       High.jump    
##  Min.   :10.44   Min.   :6.61   Min.   :12.68   Min.   :1.850  
##  1st Qu.:10.85   1st Qu.:7.03   1st Qu.:13.88   1st Qu.:1.920  
##  Median :10.98   Median :7.30   Median :14.57   Median :1.950  
##  Mean   :11.00   Mean   :7.26   Mean   :14.48   Mean   :1.977  
##  3rd Qu.:11.14   3rd Qu.:7.48   3rd Qu.:14.97   3rd Qu.:2.040  
##  Max.   :11.64   Max.   :7.96   Max.   :16.36   Max.   :2.150  
##       400m        110m.hurdle        Discus        Pole.vault   
##  Min.   :46.81   Min.   :13.97   Min.   :37.92   Min.   :4.200  
##  1st Qu.:48.93   1st Qu.:14.21   1st Qu.:41.90   1st Qu.:4.500  
##  Median :49.40   Median :14.48   Median :44.41   Median :4.800  
##  Mean   :49.62   Mean   :14.61   Mean   :44.33   Mean   :4.762  
##  3rd Qu.:50.30   3rd Qu.:14.98   3rd Qu.:46.07   3rd Qu.:4.920  
##  Max.   :53.20   Max.   :15.67   Max.   :51.65   Max.   :5.400  
##     Javeline         1500m            Rank           Points    
##  Min.   :50.31   Min.   :262.1   Min.   : 1.00   Min.   :7313  
##  1st Qu.:55.27   1st Qu.:271.0   1st Qu.: 6.00   1st Qu.:7802  
##  Median :58.36   Median :278.1   Median :11.00   Median :8021  
##  Mean   :58.32   Mean   :279.0   Mean   :12.12   Mean   :8005  
##  3rd Qu.:60.89   3rd Qu.:285.1   3rd Qu.:18.00   3rd Qu.:8122  
##  Max.   :70.52   Max.   :317.0   Max.   :28.00   Max.   :8893  
##    Competition
##  Decastar:13  
##  OlympicG:28  
##               
##               
##               
## 
head(decathlon)
##          100m Long.jump Shot.put High.jump  400m 110m.hurdle Discus
## SEBRLE  11.04      7.58    14.83      2.07 49.81       14.69  43.75
## CLAY    10.76      7.40    14.26      1.86 49.37       14.05  50.72
## KARPOV  11.02      7.30    14.77      2.04 48.37       14.09  48.95
## BERNARD 11.02      7.23    14.25      1.92 48.93       14.99  40.87
## YURKOV  11.34      7.09    15.19      2.10 50.42       15.31  46.26
## WARNERS 11.11      7.60    14.31      1.98 48.68       14.23  41.10
##         Pole.vault Javeline 1500m Rank Points Competition
## SEBRLE        5.02    63.19 291.7    1   8217    Decastar
## CLAY          4.92    60.15 301.5    2   8122    Decastar
## KARPOV        4.92    50.31 300.2    3   8099    Decastar
## BERNARD       5.32    62.77 280.1    4   8067    Decastar
## YURKOV        4.72    63.44 276.4    5   8036    Decastar
## WARNERS       4.92    51.77 278.1    6   8030    Decastar
decathlonR <- decathlon
decathlonR[1:10] <- scale(decathlon[1:10], center = TRUE)
library("spe")
data("swissroll")
library("rgl")
swissroll <- swissroll[order(swissroll$z),]
plot3d(swissroll, col = plotrix::color.scale(1:nrow(swissroll),c(0,1,1),c(1,1,0),0))

You must enable Javascript to view this page properly.

library("tidyverse")
Clusters <- read_delim("donclassif.txt.gz", ";")
PlotClusters <- function(CoordPoints, Col = NULL) {
  if (is.null(Col)) {
    Col = sample(1:nrow(CoordPoints))
  }
  ggplot(data.frame(X = CoordPoints[, 1, drop = TRUE],
                    Y = CoordPoints[, 2, drop = TRUE],
                    Col = Col),
         aes(x = X, y = Y)) + 
    geom_point(aes(colour = Col), size = 2) + guides(color = FALSE) +
    coord_fixed()
}

PlotClusters(Clusters)

PlotDecathlon <- function(CoordPoints, Col = NULL) {
  if (is.null(Col)) {
    Col = decathlon$Points
  }
  ggplot(data = data.frame( X = CoordPoints[, 1, drop = TRUE],
                            Y = CoordPoints[, 2, drop = TRUE],
                            Col = Col),
         aes(x = X, y = Y)) + 
    geom_point(aes(colour = Col), size = 5) + geom_text(label = row.names(decathlon), vjust = -1.25) +
    scale_x_continuous(expand = c(.15,0)) + scale_y_continuous(expand = c(.1,0)) +
    guides(color = FALSE) +
    coord_fixed()
}

PlotSwissRoll <- function(CoordPoints, Col = NULL) {
  if (is.null(Col)) {
    Col = 1:nrow(swissroll)
  }
  ggplot(data = data.frame( X = CoordPoints[, 1, drop = TRUE],
                            Y = CoordPoints[, 2, drop = TRUE],
                            Col = Col),
         aes(x = X, y = Y)) + 
    geom_point(aes(colour = Col), size = 5) + guides(color = FALSE) +
    coord_fixed()
}
DecathlonPCA <- PCA(decathlon[1:10])

PlotDecathlon(DecathlonPCA$ind$coord)

DecathlonRPCA <- PCA(decathlonR[1:10])

PlotDecathlon(DecathlonRPCA$ind$coord)

SwissRollPCA <- PCA(swissroll)

PlotSwissRoll(SwissRollPCA$ind$coord)

ClustersPCA <- PCA(Clusters)

PlotClusters(ClustersPCA$ind$coord)

MFA

data(poison)
summary(poison)
##       Age             Time           Sick    Sex         Nausea  
##  Min.   : 4.00   Min.   : 0.00   Sick_n:17   F:28   Nausea_n:43  
##  1st Qu.: 6.00   1st Qu.: 0.00   Sick_y:38   M:27   Nausea_y:12  
##  Median : 8.00   Median :12.00                                   
##  Mean   :16.93   Mean   :10.16                                   
##  3rd Qu.:10.00   3rd Qu.:16.50                                   
##  Max.   :88.00   Max.   :22.00                                   
##     Vomiting   Abdominals     Fever          Diarrhae       Potato  
##  Vomit_n:33   Abdo_n:18   Fever_n:20   Diarrhea_n:20   Potato_n: 3  
##  Vomit_y:22   Abdo_y:37   Fever_y:35   Diarrhea_y:35   Potato_y:52  
##                                                                     
##                                                                     
##                                                                     
##                                                                     
##      Fish        Mayo      Courgette       Cheese         Icecream 
##  Fish_n: 1   Mayo_n:10   Courg_n: 5   Cheese_n: 7   Icecream_n: 4  
##  Fish_y:54   Mayo_y:45   Courg_y:50   Cheese_y:48   Icecream_y:51  
##                                                                    
##                                                                    
##                                                                    
## 
head(poison)
##   Age Time   Sick Sex   Nausea Vomiting Abdominals   Fever   Diarrhae
## 1   9   22 Sick_y   F Nausea_y  Vomit_n     Abdo_y Fever_y Diarrhea_y
## 2   5    0 Sick_n   F Nausea_n  Vomit_n     Abdo_n Fever_n Diarrhea_n
## 3   6   16 Sick_y   F Nausea_n  Vomit_y     Abdo_y Fever_y Diarrhea_y
## 4   9    0 Sick_n   F Nausea_n  Vomit_n     Abdo_n Fever_n Diarrhea_n
## 5   7   14 Sick_y   M Nausea_n  Vomit_y     Abdo_y Fever_y Diarrhea_y
## 6  72    9 Sick_y   M Nausea_n  Vomit_n     Abdo_y Fever_y Diarrhea_y
##     Potato   Fish   Mayo Courgette   Cheese   Icecream
## 1 Potato_y Fish_y Mayo_y   Courg_y Cheese_y Icecream_y
## 2 Potato_y Fish_y Mayo_y   Courg_y Cheese_n Icecream_y
## 3 Potato_y Fish_y Mayo_y   Courg_y Cheese_y Icecream_y
## 4 Potato_y Fish_y Mayo_n   Courg_y Cheese_y Icecream_y
## 5 Potato_y Fish_y Mayo_y   Courg_y Cheese_y Icecream_y
## 6 Potato_y Fish_n Mayo_y   Courg_y Cheese_y Icecream_y
PoisonMFA <- MFA(poison, group=c(2,2,5,6), type=c("s","n","n","n"),
    name.group=c("desc","desc2","symptom","eat"))

ICA

library("fastICA")
DecathlonICA <- fastICA(decathlon[1:10], n.comp = 2)
PlotDecathlon(DecathlonICA$S)

DecathlonRICA <- fastICA(decathlonR[1:10], n.comp = 2)
PlotDecathlon(DecathlonRICA$S)

SwissRollICA <- fastICA(swissroll, n.comp = 2)
PlotSwissRoll(SwissRollICA$S)

ClustersICA <- fastICA(Clusters, n.comp = 2)
PlotClusters(ClustersICA$S)

NMF

library("NMF")
DecathlonNMF <- nmf(decathlon[1:10], rank = 2)
PlotDecathlon(basis(DecathlonNMF))

k-PCA

library("kernlab")
DecathlonKPCA <- kpca(decathlon[1:10], features = 2)
PlotDecathlon(as.matrix(scale(decathlon[1:10], scale = FALSE)) %*% pcv(DecathlonKPCA))

DecathlonRKPCA <- kpca(decathlonR[1:10], features = 2)
PlotDecathlon(as.matrix(scale(decathlonR[1:10], scale = FALSE)) %*% pcv(DecathlonRKPCA))

SwissRollKPCA <- kpca(swissroll, features = 2)
PlotSwissRoll(as.matrix(scale(swissroll, scale = FALSE)) %*% pcv(SwissRollKPCA))

Deep Learning: PCA and autoencoders

#library(keras)
# pca_encoder <-  keras_model_sequential() %>%
#   layer_dense(units = 2, input_shape = c(10))
# pca_decoder <- keras_model_sequential() %>% layer_dense(units = 10, input_shape = c(2))
# pca_autoencoder <- keras_model_sequential() %>% pca_encoder() %>% pca_decoder()
# 
# pca_autoencoder %>% compile(loss = "mse", optimizer = "sgd",
#                         metric= "mse")
# 
# decathlonR100 <- decathlonR %>% slice(rep(seq_len(nrow(decathlonR)),100))
# 
# pca_hist <- pca_autoencoder %>% fit(x = as.matrix(decathlonR100[,1:10]),
#                                     y= as.matrix(decathlonR100[,1:10]),
#                                     epochs = 1000,
#                                     batch_size = 32,
#                                     verbose = 0)
# decathlon_pca_coords <- predict(pca_encoder, x = as.matrix(decathlonR[,1:10]))
# PlotDecathlon(decathlon_pca_coords)
# encoder <- keras_model_sequential() %>%
#   layer_dense(units = 4, activation = "relu", input_shape = c(10)) %>%
#   layer_dense(units = 2)
# decoder <- keras_model_sequential() %>%
#   layer_dense(units = 4, activation = "relu", input_shape = c(2)) %>%
#   layer_dense(units = 10)
# autoencoder <- keras_model_sequential() %>% encoder %>% decoder()
# 
# autoencoder %>% compile(loss = "mse", optimizer = "sgd",
#                         metric= "mse")
# 
# ae_hist <- autoencoder %>% fit(x = as.matrix(decathlonR100[,1:10]),
#                             y= as.matrix(decathlonR100[,1:10]),
#                             epochs = 5000,
#                             batch_size = 32,
#                             verbose = 0)
# decathlon_ae_coords <- predict(encoder, x = as.matrix(decathlonR[,1:10]))
# PlotDecathlon(decathlon_ae_coords)

MDS

DistDecathlon <- dist(decathlon[1:10])
DecathlonMDS <- cmdscale(DistDecathlon, k = 2)
PlotDecathlon(DecathlonMDS)

DistDecathlonR <- dist(decathlonR[1:10])
DecathlonRMDS <- cmdscale(DistDecathlonR, k = 2)
PlotDecathlon(DecathlonRMDS)

DistSwissRoll <- dist(swissroll)
SwissRollMDS <- cmdscale(DistSwissRoll, k = 2)
PlotSwissRoll(SwissRollMDS)

DistClusters <- dist(Clusters)
DistUClusters <- dist(unique(Clusters))
ClustersMDS <- cmdscale(DistClusters, k = 2)
PlotClusters(ClustersMDS)

ISOMAP

library("vegan")
DecathlonISOMAP <- isomap(DistDecathlon, ndim = 2, k = 5)
PlotDecathlon(DecathlonISOMAP$points)

DecathlonRISOMAP <- isomap(DistDecathlonR, ndim = 2, k = 5)
PlotDecathlon(DecathlonRISOMAP$points)

SwissRollISOMAP <- isomap(DistSwissRoll, ndim = 2, k = 10)
PlotSwissRoll(SwissRollISOMAP$points)

Random Projection

library("chemometrics")
DecathlonRandom <- RPvectors(2, 10)
PlotDecathlon(as.matrix(decathlon[1:10]) %*% DecathlonRandom)

PlotDecathlon(as.matrix(decathlonR[1:10]) %*% DecathlonRandom)

SwissRollRandom <- RPvectors(2, 3)
PlotSwissRoll(as.matrix(swissroll) %*% SwissRollRandom)

ClustersRandom <- RPvectors(2, 2)
PlotClusters(as.matrix(Clusters) %*% ClustersRandom)

LLE

library("lle")
DecathlonLLE <- lle(decathlon[1:10], m = 2, k = 10)
## finding neighbours
## calculating weights
## computing coordinates
PlotDecathlon(DecathlonLLE$Y)

DecathlonRLLE <- lle(decathlonR[1:10], m = 2, k = 10)
## finding neighbours
## calculating weights
## computing coordinates
PlotDecathlon(DecathlonRLLE$Y)

SwissRollLLE <- lle(swissroll, m = 2, k = 10)
## finding neighbours
## calculating weights
## computing coordinates
PlotSwissRoll(SwissRollLLE$Y)

ClustersLLE <- lle(Clusters, m = 2, k = 10)
## finding neighbours
## calculating weights
## computing coordinates
PlotClusters(ClustersLLE$Y)

t-SNE

library("Rtsne")

DecathlonTSNE <- Rtsne(as.matrix(decathlon[1:10]), perplexity = 10, pca = FALSE)
PlotDecathlon(DecathlonTSNE$Y)

DecathlonRTSNE <- Rtsne(as.matrix(decathlonR[1:10]), perplexity = 10, pca = FALSE)
PlotDecathlon(DecathlonRTSNE$Y)

SwissRollTSNE <- Rtsne(as.matrix(swissroll), perplexity = 30, pca = FALSE)
PlotSwissRoll(SwissRollTSNE$Y)

ClustersTSNE <- Rtsne(as.matrix(unique(Clusters)), perplexity = 30, pca = FALSE)
PlotSwissRoll(ClustersTSNE$Y, Col = 1:nrow(unique(Clusters)))

UMAP

library(uwot)

DecathlonUMAP <- umap(as.matrix(decathlon[1:10]))
PlotDecathlon(DecathlonUMAP)

DecathlonRUMAP <- umap(as.matrix(decathlonR[1:10]))
PlotDecathlon(DecathlonRUMAP)

SwissRollUMAP <- umap(as.matrix(swissroll))
PlotSwissRoll(SwissRollUMAP)

ClustersUMAP <- umap(as.matrix(unique(Clusters)))
PlotSwissRoll(ClustersUMAP, Col = 1:nrow(unique(Clusters)))

Graph

library("igraph")
DecathlonLap = embed_laplacian_matrix(graph.adjacency(as.matrix(exp(-DistDecathlon/3)), weighted = TRUE),2)
PlotDecathlon(DecathlonLap$X)

DecathlonRLap = embed_laplacian_matrix(graph.adjacency(as.matrix(exp(-DistDecathlonR/3)), weighted = TRUE),2)
PlotDecathlon(DecathlonRLap$X)

SwissRollLap = embed_laplacian_matrix(graph.adjacency(as.matrix(exp(-DistSwissRoll/3)), weighted = TRUE),2)
PlotSwissRoll(SwissRollLap$X)

Clustering

K-Means

K <- 4
K2 <- 10

DecathlonRKmeans <- kmeans(decathlonR[1:10], K)
PlotDecathlon(DecathlonRPCA$ind$coord, Col = factor(DecathlonRKmeans$cluster))

DecathlonRPCAKmeans <- kmeans(DecathlonRPCA$ind$coord[,1:2], K)
PlotDecathlon(DecathlonRPCA$ind$coord, Col = factor(DecathlonRPCAKmeans$cluster))

PlotDecathlon(DecathlonTSNE$Y, Col = factor(DecathlonRKmeans$cluster))

DecathlonTSNEKmeans <- kmeans(DecathlonTSNE$Y, K)
PlotDecathlon(DecathlonTSNE$Y, Col = factor(DecathlonTSNEKmeans$cluster))

PlotDecathlon(DecathlonRPCA$ind$coord, Col = factor(DecathlonTSNEKmeans$cluster))

ClustersKmeans <- kmeans(Clusters, K2)
PlotClusters(Clusters, Col = factor(ClustersKmeans$cluster))

ClustersTSNEKmeans <- kmeans(ClustersTSNE$Y, K2)
PlotClusters(unique(Clusters), Col = factor(ClustersTSNEKmeans$cluster))

PAM

DecathlonRPam <- pam(decathlonR[1:10], K)
PlotDecathlon(DecathlonRPCA$ind$coord, Col = factor(DecathlonRPam$cluster))

DecathlonRPCAPam <- pam(DecathlonRPCA$ind$coord[,1:2], K)
PlotDecathlon(DecathlonRPCA$ind$coord, Col = factor(DecathlonRPCAPam$cluster))

DecathlonTSNEPam <- pam(DecathlonTSNE$Y, K)
PlotDecathlon(DecathlonTSNE$Y, Col = factor(DecathlonTSNEPam$cluster))

ClustersPam <- kmeans(Clusters, K2)
PlotClusters(Clusters, Col = factor(ClustersPam$cluster))

ClustersTSNEPam <- kmeans(ClustersTSNE$Y, K2)
PlotClusters(unique(Clusters), Col = factor(ClustersTSNEPam$cluster))

GMM

library("Rmixmod")
DecathlonGMM <- mixmodCluster(decathlon[1:10], K)
PlotDecathlon(DecathlonPCA$ind$coord, Col = factor(DecathlonGMM@bestResult@partition))

DecathlonRGMM <- mixmodCluster(decathlonR[1:10], K)
PlotDecathlon(DecathlonRPCA$ind$coord, Col = factor(DecathlonRGMM@bestResult@partition))

ClustersGMM <- mixmodCluster(Clusters, K2)
PlotClusters(Clusters, Col = factor(ClustersGMM@bestResult@partition))

PLSA / LDA

library(topicmodels)
data("AssociatedPress")

k = 2
ap_lda <- LDA(AssociatedPress, k , control = list(seed = 1234))
ap_lda
## A LDA_VEM topic model with 2 topics.
terms(ap_lda,3)
##      Topic 1   Topic 2     
## [1,] "percent" "i"         
## [2,] "million" "president" 
## [3,] "new"     "government"

DBSCAN

library("fpc")

DecathlonDBScan <- dbscan(decathlon[1:10], 10)
PlotDecathlon(DecathlonPCA$ind$coord, Col = factor(DecathlonDBScan$cluster))

DecathlonRDBScan <- dbscan(decathlonR[1:10], 3)
PlotDecathlon(DecathlonRPCA$ind$coord, Col = factor(DecathlonRDBScan$cluster))

ClustersDBScan <- dbscan(Clusters, .2)
PlotClusters(Clusters, Col = factor(ClustersDBScan$cluster))

ClustersTSNEDBScan <- dbscan(ClustersTSNE$Y, 2)
PlotClusters(unique(Clusters), Col = factor(ClustersTSNEDBScan$cluster))

PdfCluster

library("pdfCluster")
ClustersPdf <- pdfCluster(Clusters)
PlotClusters(Clusters, Col = factor(ClustersPdf@clusters))

Mean Shift

library("LPCM")
ClustersMS <- ms(Clusters, h = .04)

PlotClusters(Clusters, Col = factor(ClustersMS$cluster.label))

Linkage

library("ggdendro")
library("fastcluster")
ClustersMLD <- hclust(DistClusters, method = "single")
ggdendrogram(ClustersMLD)

ClustersML <- cutree(ClustersMLD, k = K2)
PlotClusters(Clusters, Col = factor(ClustersML))

ClustersALD <- hclust(DistClusters, method = "average")
ggdendrogram(ClustersALD)

ClustersAL <- cutree(ClustersALD, k = K2)
PlotClusters(Clusters, Col = factor(ClustersAL))

ClustersMMLD <- hclust(DistClusters, method = "complete")
ggdendrogram(ClustersMMLD)

ClustersMML <- cutree(ClustersMMLD, k = K2)
PlotClusters(Clusters, Col = factor(ClustersMML))

WARD

ClustersWardD <- hclust(DistClusters, method = "ward.D2")
ggdendrogram(ClustersWardD)

ClustersWard <- cutree(ClustersWardD, k = K2)
PlotClusters(Clusters, Col = factor(ClustersWard))

DIANA

ClustersDianaD <- as.hclust(diana(Clusters))
ggdendrogram(ClustersDianaD)

ClustersDiana <- cutree(ClustersDianaD, k = K2)
PlotClusters(Clusters, Col = factor(ClustersDiana))

Affinity Propagation

library("apcluster")
ClustersAF <- apcluster(negDistMat(r=2), Clusters)
PlotClusters(Clusters, labels(ClustersAF))

CLARA

ClustersClara <- clara(Clusters, K2)
PlotClusters(Clusters, Col = factor(ClustersClara$clustering))

ClustersSNEClara <- clara(ClustersTSNE$Y, K2)
PlotClusters(unique(Clusters), Col = factor(ClustersSNEClara$clustering))