Cargamos librerías:
# install.packages("cluster")
# install.packages("factoextra")
# install.packages("dendextend")
library(tidyverse)
library(cluster) # clustering algorithms
library(factoextra) # clustering visualization
library(dendextend) # for comparing two dendrograms
Usamos la misma notación que para clasificación:
# Same notation as for classification: X_ holds the features, y the labels.
X_ <- iris %>% select(-Species)
y <- iris$Species
X_
# Standardize each attribute (mean 0, sd 1) so that no attribute dominates
# the Euclidean distance. as_tibble() replaces the deprecated as.tibble().
X <- as_tibble(scale(X_))
X
¿Por qué tenemos que escalar los datos?
Observemos la fórmula de la distancia Euclidiana entre dos puntos \((x_1, x_2, \ldots, x_d)\) y \((y_1, y_2, \ldots, y_d)\):
\[d(x, y) = \sqrt{\sum_{i=1}^d (x_i - y_i)^2}\]
Veamos todos los atributos usando un scatterplot matrix:
# Reload the iris dataset. NOTE(review): X and y above were already derived
# from iris, so this reload appears redundant — confirm it is intentional.
data("iris")
# Scatterplot matrix of the scaled attributes, colored by the true species.
pairs(X, col=y)
# k-means with k = 3; nstart = 25 restarts from 25 random initializations
# and keeps the best solution (lowest total within-cluster sum of squares).
km <- kmeans(X, centers = 3, nstart = 25)
# Cluster label (1-3) assigned to each observation.
km$cluster
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 1 1 1 3 1 1 1 1 1 1 1 1 3 1 1 1 1
## [71] 3 1 1 1 1 3 3 3 1 1 1 1 1 1 1 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 3 3 3
## [106] 3 1 3 3 3 3 3 3 1 1 3 3 3 3 1 3 1 3 1 3 3 1 3 3 3 3 3 3 1 1 3 3 3 1 3
## [141] 3 3 1 3 3 3 1 3 3 1
# Internal structure of the kmeans result object (all available components).
str(km)
## List of 9
## $ cluster : int [1:150] 2 2 2 2 2 2 2 2 2 2 ...
## $ centers : num [1:3, 1:4] -0.0501 -1.0112 1.1322 -0.8804 0.8504 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:3] "1" "2" "3"
## .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## $ totss : num 596
## $ withinss : num [1:3] 44.1 47.4 47.5
## $ tot.withinss: num 139
## $ betweenss : num 457
## $ size : int [1:3] 53 50 47
## $ iter : int 3
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
# Printing the object gives a readable summary: cluster sizes, centers,
# assignments and the within/between sum-of-squares breakdown.
km
## K-means clustering with 3 clusters of sizes 53, 50, 47
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 -0.05005221 -0.88042696 0.3465767 0.2805873
## 2 -1.01119138 0.85041372 -1.3006301 -1.2507035
## 3 1.13217737 0.08812645 0.9928284 1.0141287
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 1 1 1 3 1 1 1 1 1 1 1 1 3 1 1 1 1
## [71] 3 1 1 1 1 3 3 3 1 1 1 1 1 1 1 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 3 3 3
## [106] 3 1 3 3 3 3 3 3 1 1 3 3 3 3 1 3 1 3 1 3 3 1 3 3 3 3 3 3 1 1 3 3 3 1 3
## [141] 3 3 1 3 3 3 1 3 3 1
##
## Within cluster sum of squares by cluster:
## [1] 44.08754 47.35062 47.45019
## (between_SS / total_SS = 76.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Confusion matrix: true species (rows) vs. assigned cluster (columns).
# Setosa is perfectly isolated; versicolor/virginica are partially mixed.
table(y, km$cluster)
##
## y 1 2 3
## setosa 0 50 0
## versicolor 39 0 11
## virginica 14 0 36
# Purity of a clustering against reference class labels.
# For each cluster, count the observations of its majority class; purity is
# the total of those counts divided by the number of observations (in [0, 1],
# 1 = every cluster contains a single class).
purity_score <- function(clusters, classes) {
  confusion <- table(classes, clusters)
  # Majority-class count per cluster (one column per cluster).
  majority_hits <- vapply(
    seq_len(ncol(confusion)),
    function(j) max(confusion[, j]),
    numeric(1)
  )
  sum(majority_hits) / length(clusters)
}
# Purity of the k-means assignment against the true species labels.
purity_score(km$cluster, as.integer(y))
## [1] 0.8333333
# both assignments coincide, so purity is perfect
purity_score(km$cluster, km$cluster)
## [1] 1
# the second assignment is random, so purity drops toward chance level
purity_score(km$cluster, sample(1:3, 150, replace = TRUE))
## [1] 0.4266667
# Weighted entropy of a clustering against reference labels.
# For each cluster j: e_j = -sum_t p_tj * log2(p_tj), where p_tj is the
# fraction of cluster j's members belonging to class t. The score is the
# cluster-size-weighted average sum_j (m_j / m) * e_j. 0 = pure clusters;
# larger values mean more class mixing (upper bound log2(k)).
# BUG FIX: the original normalized ROWS (classes) via
# sweep(cnts, 1, rowSums(cnts), "/"), i.e. p(cluster | class), and then
# weighted those rows by cluster sizes — a mismatch. The counts matrix from
# sapply() has classes as rows and clusters as COLUMNS, so each column must
# be normalized by its cluster size and per-cluster entropies come from
# colSums().
entropy_score <- function(cluster, truth) {
  k <- max(cluster, truth)
  # Fix both labelings to the same 1:k level set so the table is square.
  cluster <- factor(cluster, levels = 1:k)
  truth <- factor(truth, levels = 1:k)
  m <- length(cluster)
  mi <- table(cluster) # cluster sizes m_j
  cnts <- split(truth, cluster)
  # k x k matrix: rows = classes, columns = clusters.
  cnts <- sapply(cnts, FUN = function(n) table(n))
  # p[t, j] = P(class t | cluster j): normalize each column by its total.
  p <- sweep(cnts, 2, colSums(cnts), "/")
  p[is.nan(p)] <- 0 # empty clusters contribute nothing
  e <- -p * log(p, 2)
  # 0 * log2(0) yields NaN; na.rm treats it as 0 per the entropy convention.
  sum(colSums(e, na.rm = TRUE) * mi / m)
}
# Entropy of the k-means assignment against the true species labels.
entropy_score(km$cluster, as.integer(y))
## [1] 0.5214304
# both assignments are equal, so the entropy is zero
entropy_score(km$cluster, km$cluster)
## [1] 0
# the second assignment is random, so entropy approaches log2(3) ~ 1.585
entropy_score(km$cluster, sample(1:3, 150, replace = TRUE))
## [1] 1.57535
Evaluamos las características de los clusters: ¿son cohesivos? ¿están lejos de otros clusters?
Una métrica que mide separación y cohesión a la vez es el Silhouette score.
# equivalent to kmeans, but compatible with the fviz_* visualization methods
km.2 <- eclust(X, "kmeans", k = 3, nstart = 25)
# Silhouette plot: per-observation silhouette widths grouped by cluster,
# plus the average width per cluster.
fviz_silhouette(km.2)
## cluster size ave.sil.width
## 1 1 50 0.64
## 2 2 47 0.35
## 3 3 53 0.39
# Heatmap of the pairwise Euclidean distance matrix of the scaled data.
fviz_dist(dist(X))
set.seed(123)
# Total within-cluster sum of squares for a k-means fit with k clusters
# (10 random starts), on the scaled iris data X defined above.
wss <- function(k) {
  fit <- kmeans(X, k, nstart = 10)
  fit$tot.withinss
}
# Evaluate the total WSS for k = 1..15 (elbow method).
k.values <- 1:15
wss_values <- map_dbl(k.values, wss)
# Plot WSS against k; the "elbow" where the decrease flattens suggests k.
tibble(k=k.values, w=wss_values) %>%
ggplot(aes(x=k, y=w)) +
geom_point() +
geom_line() +
labs(x="Numero de clusters", y="Suma del error cuadratico interno")
O bien
# factoextra helper that computes and plots the elbow (WSS) curve directly.
fviz_nbclust(X, kmeans, method = "wss")
Pregunta: Si tuviera que determinar un valor elevado para el número de clusters (por ejemplo, cientos o miles de clusters), ¿cómo determinaría eficientemente el número de clusters usando el método del codo?
# Average silhouette width over all observations for a k-means fit with
# k clusters on the scaled iris data X (25 random starts).
avg_sil <- function(k) {
  fit <- kmeans(X, centers = k, nstart = 25)
  sil <- silhouette(fit$cluster, dist(X))
  # Column 3 of a silhouette object holds the per-observation width.
  mean(sil[, 3])
}
# Silhouette needs at least 2 clusters, so start at k = 2.
k.values <- 2:15
avg_sil_values <- map_dbl(k.values, avg_sil)
# Plot average silhouette vs. k; the maximum suggests the best k.
tibble(k = k.values, s = avg_sil_values) %>%
ggplot(aes(x = k, y = s)) +
geom_point() +
geom_line() +
labs(x = "Numero de clusters", y = "Promedio del Silhouette score")
# factoextra helper that computes and plots the same curve directly.
fviz_nbclust(X, kmeans, method = "silhouette")
# Load the 2010-2011 traffic accident dataset from the course server.
data <- read_csv('https://users.dcc.uchile.cl/~mquezada/diplomado-2018/accidentes-2010-2011.csv')
## Parsed with column specification:
## cols(
## Muestra = col_character(),
## `Descripcion Muestra` = col_character(),
## Año = col_integer(),
## Atropello = col_number(),
## Caída = col_number(),
## Colisión = col_number(),
## Choque = col_number(),
## Volcadura = col_number(),
## Otros = col_number(),
## Muertos = col_number(),
## `Lesionados Graves` = col_number(),
## `Lesionados Menos Graves` = col_number(),
## `Lesionados Leves` = col_number(),
## `(Incorporar Otra Columna Si es Necesario)` = col_character()
## )
# Keep only communal-level rows for 2011 and drop bookkeeping columns.
data.cont <- data %>%
  filter(Muestra == 'Comunal', Año == 2011) %>%
  select(-c(Muestra, Año, `(Incorporar Otra Columna Si es Necesario)`))
# Commune names, kept aside as labels for later inspection.
labels <- data.cont$`Descripcion Muestra`
# Standardize the numeric attributes before clustering.
# as_tibble() replaces the deprecated as.tibble().
accidentes <- as_tibble(scale(data.cont %>% select(-(`Descripcion Muestra`))))
accidentes
set.seed(123)
# Total within-cluster sum of squares for k-means on the accident data.
wss <- function(k) {
return(kmeans(accidentes, k, nstart = 10)$tot.withinss)
}
# Elbow method on the accident data, k = 1..15.
k.values <- 1:15
wss_values <- map_dbl(k.values, wss)
tibble(k=k.values, w=wss_values) %>%
ggplot(aes(x=k, y=w)) +
geom_point() +
geom_line() +
labs(x="Numero de clusters", y="Suma del error cuadratico interno")
# Average silhouette width for k-means with k clusters on the accident data.
# BUG FIX: the original body computed kmeans/silhouette on X (the iris data)
# instead of `accidentes`, which this section analyzes — the sibling wss()
# above was correctly switched to `accidentes`, but this copy was not.
avg_sil <- function(k) {
  km.res <- kmeans(accidentes, centers = k, nstart = 25)
  ss <- silhouette(km.res$cluster, dist(accidentes))
  # Column 3 of a silhouette object holds the per-observation width.
  mean(ss[, 3])
}
# Silhouette needs at least 2 clusters, so start at k = 2.
k.values <- 2:15
avg_sil_values <- map_dbl(k.values, avg_sil)
# Plot average silhouette vs. k; the maximum suggests the best k.
tibble(k = k.values, s = avg_sil_values) %>%
ggplot(aes(x = k, y = s)) +
geom_point() +
geom_line() +
labs(x = "Numero de clusters", y = "Promedio del Silhouette score")
# k-means with k = 2 on the accident data (eclust, so fviz_* methods work).
km.3 <- eclust(accidentes, "kmeans", k = 2, nstart = 25)
km.3
## K-means clustering with 2 clusters of sizes 23, 319
##
## Cluster means:
## Atropello Caída Colisión Choque Volcadura Otros
## 1 2.9567953 2.3847384 2.9840601 3.0765139 2.2318985 1.653877
## 2 -0.2131859 -0.1719404 -0.2151517 -0.2218176 -0.1609206 -0.119245
## Muertos Lesionados Graves Lesionados Menos Graves Lesionados Leves
## 1 2.1996067 2.9405916 2.5418137 3.0941112
## 2 -0.1585923 -0.2120176 -0.1832656 -0.2230864
##
## Clustering vector:
## [1] 1 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [106] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2
## [141] 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2
## [176] 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2
## [211] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2
## [246] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [281] 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 2 1 1 2 2 2 1 2
## [316] 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 773.1913 909.8509
## (between_SS / total_SS = 50.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault" "clust_plot" "silinfo" "nbclust"
## [13] "data"
# Silhouette of the 2-cluster solution: the small cluster (high-accident
# rows, per the positive cluster means above) has a much weaker silhouette.
fviz_silhouette(km.3)
## cluster size ave.sil.width
## 1 1 23 0.22
## 2 2 319 0.80
# Inspect each cluster on the original (unscaled) data.
data.cont %>%
filter(km.3$cluster == 1)
data.cont %>%
filter(km.3$cluster == 2)
# Hierarchical clustering (Ward linkage on Euclidean distances), cut at k = 2.
hc <- eclust(accidentes, "hclust", k = 2, hc_metric = "euclidean", hc_method = "ward.D2")
# Dendrogram of the full hierarchy (labels hidden for readability).
fviz_dend(hc, show_labels = FALSE)
# Clusters projected onto the first two principal components.
fviz_cluster(hc)
fviz_silhouette(hc)
## cluster size ave.sil.width
## 1 1 17 0.28
## 2 2 325 0.80
# Inspect the small cluster on the original (unscaled) data.
data.cont %>%
filter(hc$cluster == 1)
Vamos a filtrar el primer cluster y a examinarlo por separado:
# Commune names of the first (small) cluster.
labels.c1 <- labels[hc$cluster == 1]
# Convert to a plain data.frame before assigning row names: setting row
# names on a tibble is deprecated (the original code triggered exactly that
# warning). Row names become the dendrogram labels below.
c1 <- accidentes %>%
  filter(hc$cluster == 1) %>%
  as.data.frame()
rownames(c1) <- labels.c1
# Re-cluster only these communes so the labeled dendrogram is readable.
hc.1 <- eclust(c1, "hclust", k = 2, hc_metric = "euclidean", hc_method = "ward.D2")
fviz_dend(hc.1, show_labels = TRUE)
# Second cluster (the bulk of the communes): re-cluster it into 5 subgroups.
c2 <- accidentes %>%
filter(hc$cluster == 2)
hc.2 <- eclust(c2, "hclust", k = 5, hc_metric = "euclidean", hc_method = "ward.D2")
fviz_dend(hc.2, show_labels = FALSE)
# Original (unscaled) rows of cluster 2, for interpreting the subgroups.
data.cont.c2 <- data.cont %>%
filter(hc$cluster == 2)
# Examine each of the 5 subgroups on the original scale.
data.cont.c2 %>%
filter(hc.2$cluster == 1)
# NOTE(review): data.cont.c2 is recomputed identically here — redundant.
data.cont.c2 <- data.cont %>%
filter(hc$cluster == 2)
data.cont.c2 %>%
filter(hc.2$cluster == 2)
data.cont.c2 %>%
filter(hc.2$cluster == 3)
data.cont.c2 %>%
filter(hc.2$cluster == 4)
data.cont.c2 %>%
filter(hc.2$cluster == 5)