# R script (139 lines, 2.7 KiB)
# Load the state-level income / electricity table and derive log10(income).
# NOTE(review): the working directory below is a machine-specific absolute
# path, so the script only runs unchanged where that directory exists.
setwd("/home/sek1ro/git/public/lab/ds/25-1/r")

# load() restores saved objects under their stored names; the rest of the
# script relies on one called `income_elec_state` being created here.
load("./income_elec_state.Rdata")

# Continue under the shorter name `df` and drop the original binding.
df <- income_elec_state
rm(income_elec_state)

# Extra feature: income on a base-10 log scale.
df[["incomelog"]] <- log10(df[["income"]])
|
|
|
|
#' Choose a k-means cluster count with an elbow heuristic.
#'
#' Runs k-means on the first two columns of `df` for k = 1..max_k, plots the
#' total within-cluster sum of squares (WSS) against k, and returns the k at
#' which the WSS improvement collapses most sharply.
#'
#' @param df Data frame (or matrix) whose first two columns are the numeric
#'   features to cluster.
#' @param max_k Largest k to try (default 10). It is capped at
#'   `nrow(df) - 1`, because `kmeans()` cannot fit as many centres as rows.
#' @param nstart Number of random starts per k, forwarded to `kmeans()`.
#'   Several starts keep the WSS curve from being distorted by one unlucky
#'   initialisation.
#' @return A single integer k, at least 2.
elbow_wss <- function(df, max_k = 10, nstart = 25) {
  features <- df[, 1:2]

  max_k <- min(max_k, nrow(features) - 1L)
  if (max_k < 3L) {
    stop("elbow_wss() needs at least 4 rows and max_k >= 3", call. = FALSE)
  }

  k_values <- seq_len(max_k)
  wss <- vapply(
    k_values,
    function(k) kmeans(features, centers = k, nstart = nstart)$tot.withinss,
    numeric(1)
  )

  plot(k_values, wss, type = "b", xlab = "k", ylab = "Total within-cluster SS")

  # wss_diff[j] is the change in WSS when going from j to j + 1 clusters.
  wss_diff <- diff(wss)

  # wss_ratio[j] compares the gain of the step (j + 1 -> j + 2) with the gain
  # of the step (j -> j + 1). The smallest ratio therefore marks k = j + 1 as
  # the last cluster count that still paid off, hence the "+ 1L" below.
  wss_ratio <- wss_diff[-1] / wss_diff[-length(wss_diff)]

  which.min(wss_ratio) + 1L
}
|
|
|
|
library(ggplot2)
|
|
|
|
#' Cluster a data frame with k-means and draw a labelled scatter plot.
#'
#' @param data Data frame with numeric columns `income` and `elec`. All
#'   columns are passed to `kmeans()`; the row names are used as point labels.
#' @param k Number of clusters.
#' @param log If `TRUE`, both axes are drawn on a log10 scale. Clustering is
#'   always done on the untransformed values; only the axes change.
#' @return The ggplot object, invisibly (it is also printed).
plot_kmeans <- function(data, k, log) {
  # Cluster before any helper columns are added to `data`.
  fit <- kmeans(data, centers = k)

  centers <- as.data.frame(fit$centers)
  centers$cluster <- as.factor(seq_len(k))

  # Take the labels from the argument itself rather than from a global `df`,
  # so the function stays correct when called on any other data frame.
  data$state <- rownames(data)
  data$cluster <- as.factor(fit$cluster)

  plt <- ggplot(data, aes(x = income, y = elec)) +
    geom_point(aes(color = cluster)) +
    geom_text(aes(label = state), vjust = 1.5, size = 2) +
    # Cluster centres: large triangles in the colour of their cluster.
    geom_point(data = centers, aes(color = cluster), shape = 17, size = 5) +
    theme_minimal()

  if (log) {
    plt <- plt + scale_x_log10() + scale_y_log10()
  }

  print(plt)
  invisible(plt)
}
|
|
|
|
# First pass: k-means on the complete data set.
# `data` holds the raw features; `datalog` swaps income for log10(income).
data <- df[c("income", "elec")]
datalog <- df[c("incomelog", "elec")]

# Elbow-based cluster counts, one per feature set.
k <- elbow_wss(data)
klog <- elbow_wss(datalog)

# Both plots cluster the raw `data`; the second one uses the k chosen on the
# log-income features and draws log-scaled axes.
plot_kmeans(data, k, log = FALSE)
plot_kmeans(data, klog, log = TRUE)
|
|
|
|
|
|
library(maps)
|
|
# Colour the US state map by k-means cluster.
res <- kmeans(data, centers = k)

# map("state") draws more polygons than there are states (several states are
# split into pieces) and consumes `col` one entry per polygon. The cluster
# vector therefore has to be expanded to polygon order by name; sorting it by
# row name does not line the colours up with the polygons.
region <- names(res$cluster)
stopifnot(!is.null(region))

# NOTE(review): assumes row names are either full state names or two-letter
# postal abbreviations (plus "DC") - confirm against the data file.
abb_pos <- match(region, state.abb)
is_abb <- !is.na(abb_pos)
region[is_abb] <- state.name[abb_pos[is_abb]]
region[region == "DC"] <- "district of columbia"

# match.map() returns, for every polygon, the index of the matching region
# (NA when the data has no row for it; those polygons stay unfilled).
polygon_to_row <- match.map("state", region, warn = FALSE)
map_color <- unname(res$cluster[polygon_to_row])

map("state", fill = TRUE, col = map_color)
|
|
|
|
# Drop rows whose `elec` value lies outside the 1.5 * IQR fences
# (strict inequalities, so values exactly on a fence are removed too).
# The spread and fences get descriptive names instead of `IQR`, `min` and
# `max`, which would shadow the base/stats functions of the same names in the
# global environment for the rest of the script.
Q1 <- quantile(df$elec, 0.25)
Q3 <- quantile(df$elec, 0.75)
elec_iqr <- Q3 - Q1
lower_fence <- Q1 - 1.5 * elec_iqr
upper_fence <- Q3 + 1.5 * elec_iqr

df <- subset(df, df$elec > lower_fence & df$elec < upper_fence)
|
|
|
|
# Second pass: repeat the k-means analysis on the outlier-filtered `df`.
data <- df[c("income", "elec")]
datalog <- df[c("incomelog", "elec")]

# Re-estimate the cluster counts on the reduced data.
k <- elbow_wss(data)
klog <- elbow_wss(datalog)

# As before, both plots cluster the raw features; the second uses `klog`
# and log-scaled axes.
plot_kmeans(data, k, log = FALSE)
plot_kmeans(data, klog, log = TRUE)
|
|
|
|
library(ggdendro)
|
|
|
|
|
|
#' Hierarchically cluster income/elec data and draw a labelled scatter plot.
#'
#' @param df Data frame with numeric columns `income` and `elec`; its row
#'   names are used as point labels.
#' @param linkage Agglomeration method passed to `hclust()` as `method`.
#' @param k Number of groups to cut the tree into.
#' @return The ggplot object, invisibly (it is also printed).
plot_hclust <- function(df, linkage, k) {
  data <- df[, c("income", "elec")]

  tree <- hclust(dist(data, method = "euclidean"), method = linkage)

  # Cut the tree once and reuse the membership vector.
  membership <- cutree(tree, k = k)
  data$cluster <- as.factor(membership)
  data$state <- rownames(df)

  # Console diagnostics kept from the original implementation.
  print(membership)
  print(data)

  # Every layer must be joined with `+`; without it theme_minimal() would be
  # evaluated as a separate statement and never applied to the plot.
  plt <- ggplot(data, aes(x = income, y = elec)) +
    geom_point(aes(color = cluster)) +
    geom_text(aes(label = state), vjust = 1.5, size = 2) +
    theme_minimal()

  print(plt)
  invisible(plt)
}
|
|
|
|
# Average-linkage clustering of the current `data`, cut into 5 groups.
plot_hclust(data, "average", 5)

# Single-linkage tree on the same features, shown as a dendrogram.
distance <- dist(data, method = "euclidean")
clust <- hclust(distance, method = "single")
plot(ggdendrogram(clust))

# Group membership for a 3-way cut of the single-linkage tree.
cutree(clust, k = 3)
|