feat(ds): r4

This commit is contained in:
2025-12-02 14:30:05 +03:00
parent 07a6cf6184
commit 98e89cfea6
5 changed files with 101 additions and 0 deletions

101
ds/25-1/r/4.R Normal file
View File

@ -0,0 +1,101 @@
# Work from the lab directory so the relative data path below resolves.
# NOTE(review): this absolute path is machine-specific; the script only runs
# unchanged where this directory exists.
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')

# load() restores the saved object `income_elec_state` into the workspace.
load(file = "./income_elec_state.Rdata")

# Keep a short working alias and drop the original name.
df <- income_elec_state
rm(income_elec_state)

# Derived feature: base-10 logarithm of income.
df[["incomelog"]] <- log10(df[["income"]])
# Estimate the number of k-means clusters with the elbow heuristic.
#
# Runs k-means on the first two columns of `df` for k = 1..max_k, plots the
# total within-cluster sum of squares (WSS) against k, and returns the k at
# which the WSS curve flattens the most.
#
# Args:
#   df:     data frame (or matrix); only its first two columns are clustered.
#   max_k:  largest number of clusters to try; must be at least 3 so that at
#           least one ratio of successive WSS drops exists.
#   nstart: number of random starts per k, forwarded to kmeans(). Several
#           starts keep the WSS curve from being distorted by a poor local
#           optimum.
#
# Returns:
#   An integer k between 2 and max_k - 1.
elbow_wss <- function(df, max_k = 10, nstart = 25) {
  stopifnot(max_k >= 3)
  features <- df[, 1:2]

  wss <- vapply(
    seq_len(max_k),
    function(k) kmeans(features, centers = k, nstart = nstart)$tot.withinss,
    numeric(1)
  )
  plot(seq_len(max_k), wss, type = "b", xlab = "k", ylab = "wss")

  wss_diff <- diff(wss)
  wss_ratio <- wss_diff[-1] / wss_diff[-length(wss_diff)]

  # wss_ratio[j] compares the drop from k = j + 1 to j + 2 with the drop from
  # k = j to j + 1. The smallest ratio at position j therefore marks the
  # elbow at k = j + 1, not at j.
  which.min(wss_ratio) + 1L
}
library(ggplot2)
# Cluster `data` with k-means and draw a labelled scatter plot of the result.
#
# Args:
#   data: data frame with numeric columns `income` and `elec`; its row names
#         are used as point labels.
#   k:    passed to kmeans() as `centers`.
#   log:  if TRUE, both axes are drawn on a log10 scale. Clustering itself is
#         always done on the values as given.
#
# Returns:
#   The value of print() on the ggplot object (the plot is drawn as a side
#   effect).
plot_kmeans <- function(data, k, log) {
  res <- kmeans(data, centers = k)

  centers <- as.data.frame(res$centers)
  centers$cluster <- as.factor(seq_len(nrow(centers)))

  # Take the labels from the argument itself; reading them from a global data
  # frame mislabels points whenever `data` is not that exact data frame.
  labels <- rownames(data)
  data$cluster <- as.factor(res$cluster)
  data$state <- labels

  plt <- ggplot(data, aes(x = income, y = elec)) +
    # Observations, coloured by assigned cluster.
    geom_point(aes(color = cluster)) +
    # Row-name labels placed just below each observation.
    geom_text(aes(label = state), vjust = 1.5, size = 2) +
    # Cluster centres as large triangles in the matching colour.
    geom_point(
      data = centers,
      aes(color = cluster),
      shape = 17,
      size = 5
    ) +
    theme_minimal()

  if (log) {
    plt <- plt + scale_x_log10() + scale_y_log10()
  }
  print(plt)
}
# Two feature views of the same rows: raw income and log10 income, each paired
# with electricity usage.
data <- df[, c("income", "elec")]
datalog <- df[, c("incomelog", "elec")]

# Pick a cluster count for each view with the elbow heuristic.
k <- elbow_wss(data)
klog <- elbow_wss(datalog)

# Both plots cluster the raw columns; the second uses the cluster count chosen
# on the log view and draws on log10 axes.
plot_kmeans(data, k, log = FALSE)
plot_kmeans(data, klog, log = TRUE)
library(maps)
# Colour a map of the US states by k-means cluster.
res <- kmeans(data, centers = k)

# map() assigns `col` to polygons in the order of the map database, and a
# state may consist of several polygons (named "state:part"). Colours are
# therefore looked up per polygon by state name instead of relying on the
# alphabetical order of the row names.
# NOTE(review): assumes the row names of `data` are either two-letter state
# abbreviations or full state names -- confirm against the data set. Rows
# with no matching polygon are ignored; polygons with no matching row are
# left unfilled.
state_ids <- names(res$cluster)
abb_to_region <- c(
  setNames(tolower(state.name), state.abb),
  DC = "district of columbia"
)
region_names <- ifelse(
  state_ids %in% names(abb_to_region),
  abb_to_region[state_ids],
  tolower(state_ids)
)

polygon_names <- map("state", fill = TRUE, plot = FALSE, namesonly = TRUE)
polygon_state <- sub(":.*$", "", polygon_names)

map_color <- unname(res$cluster[match(polygon_state, region_names)])
map("state", fill = TRUE, col = map_color)
# Drop rows whose `elec` value lies outside Tukey's fences (more than
# 1.5 * IQR below the first or above the third quartile), then repeat the
# clustering on the remaining rows.
# The helper values get descriptive names so that they do not mask
# stats::IQR(), base::min() and base::max() in the workspace.
elec_quartiles <- quantile(df$elec, c(0.25, 0.75))
elec_q1 <- elec_quartiles[[1]]
elec_q3 <- elec_quartiles[[2]]
elec_iqr <- elec_q3 - elec_q1
lower_fence <- elec_q1 - 1.5 * elec_iqr
upper_fence <- elec_q3 + 1.5 * elec_iqr

# Strict inequalities: values exactly on a fence are removed as well.
df <- subset(df, df$elec > lower_fence & df$elec < upper_fence)

# Rebuild both feature views from the filtered rows.
data <- df[, c("income", "elec")]
datalog <- df[, c("incomelog", "elec")]

k <- elbow_wss(data)
klog <- elbow_wss(datalog)

plot_kmeans(data, k, log = FALSE)
plot_kmeans(data, klog, log = TRUE)
library(ggdendro)
# Agglomerative clustering (Euclidean distance, complete linkage).
# Distances are computed on `data` (income, elec) -- the same features that
# `k` was chosen on -- rather than on `df`, which also carries the derived
# `incomelog` column and would count income twice.
distance <- dist(data, method = "euclidean")
clust <- hclust(distance, method = "complete")

# Draw the dendrogram.
plot(ggdendrogram(clust))

# Cluster membership when the tree is cut into `k` groups.
cutree(clust, k = k)

Binary file not shown.