# K-means and hierarchical clustering of the `income_elec_state` data set
# (columns `income` and `elec`; row names are used as state labels).
#
# Steps:
#   1. load the data and add a log10(income) column;
#   2. pick k with an elbow heuristic and plot the k-means clusters;
#   3. colour a U.S. state map by cluster;
#   4. drop `elec` outliers (1.5 * IQR rule) and repeat step 2;
#   5. build a complete-linkage dendrogram and cut it into k groups.

library(ggplot2)
library(maps)
library(ggdendro)

# NOTE(review): machine-specific absolute path; kept so the script's working
# directory side effect is unchanged. Consider a project-relative path instead.
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')
load("./income_elec_state.Rdata")

# kmeans() starts from randomly chosen centres; fix the seed so that the chosen
# k and the cluster labels are reproducible between runs.
set.seed(42)

df = income_elec_state
df$incomelog = log10(df$income)
remove(income_elec_state)

#' Estimate the number of clusters with an elbow heuristic.
#'
#' Runs k-means on the first two columns of `df` for k = 1..max_k, plots the
#' total within-cluster sum of squares (WSS) against k, and returns the k at
#' which the WSS improvement falls off most sharply.
#'
#' @param df Data frame or matrix; only its first two columns are clustered.
#' @param max_k Largest k to try (at least 3, so that a ratio exists).
#' @param nstart Number of random starts per k; more than one start makes the
#'   WSS curve far less sensitive to an unlucky initialisation.
#' @return A single integer: the estimated number of clusters (2..max_k - 1).
elbow_wss = function(df, max_k = 10, nstart = 25) {
  stopifnot(max_k >= 3)

  points = df[, 1:2]
  ks = seq_len(max_k)
  wss = vapply(
    ks,
    function(k) kmeans(points, centers = k, nstart = nstart)$tot.withinss,
    numeric(1)
  )
  plot(ks, wss, type = "b")

  # wss_diff[j] is the WSS change when going from j to j + 1 clusters.
  wss_diff = diff(wss)
  # wss_ratio[j] = (change from j + 1 to j + 2) / (change from j to j + 1).
  wss_ratio = wss_diff[-1] / wss_diff[-length(wss_diff)]

  # The smallest ratio at index j means k = j + 1 was the last k that still
  # brought a large gain, so the elbow is at j + 1 (the original returned j,
  # one cluster too few).
  which.min(wss_ratio) + 1L
}

#' Run k-means and draw the clusters with labelled points and centres.
#'
#' @param data Data frame with numeric columns `income` and `elec`; its row
#'   names are used as point labels. All columns of `data` are clustered.
#' @param k Number of clusters.
#' @param log If TRUE, both axes are drawn on a log10 scale (the clustering
#'   itself is still done on the values as passed in).
#' @return The ggplot object, invisibly; the plot is printed as a side effect.
plot_kmeans = function(data, k, log = FALSE) {
  res = kmeans(data, centers = k)

  cluster_ids = seq_len(k)
  centers = as.data.frame(res$centers)
  centers$cluster = factor(cluster_ids)
  # Same level set as the centres so point and centre colours always agree.
  data$cluster = factor(res$cluster, levels = cluster_ids)
  # Label with the rows of the argument, not of a global data frame.
  data$state = rownames(data)

  plt = ggplot() +
    geom_point(
      data = data,
      aes(x = income, y = elec, color = cluster)
    ) +
    geom_text(
      data = data,
      vjust = 1.5,
      size = 2,
      aes(x = income, y = elec, label = state)
    ) +
    geom_point(
      data = centers,
      shape = 17,
      size = 5,
      aes(x = income, y = elec, color = cluster)
    ) +
    theme_minimal()

  if (log) {
    plt = plt + scale_x_log10() + scale_y_log10()
  }

  print(plt)
  invisible(plt)
}

#' Look up a cluster id for every polygon drawn by maps::map("state").
#'
#' map("state") names its polygons by lower-case full state name, with
#' sub-regions suffixed as "state:part", and draws more polygons than there
#' are states, so colours have to be matched by name rather than by position.
#'
#' @param cluster Named integer vector of cluster ids (e.g. `kmeans()$cluster`);
#'   names may be two-letter state abbreviations or full state names.
#'   NOTE(review): presumably abbreviations in this data set - confirm.
#' @param polygon_names Polygon names from `map(..., namesonly = TRUE)`.
#' @return Integer vector, one entry per polygon; NA where no row matches.
cluster_map_colors = function(cluster, polygon_names) {
  polygon_state = sub(":.*$", "", polygon_names)

  abb_to_name = c(
    setNames(tolower(state.name), state.abb),
    DC = "district of columbia"
  )
  keys = names(cluster)
  is_abb = keys %in% names(abb_to_name)
  keys[is_abb] = abb_to_name[keys[is_abb]]
  keys = tolower(keys)

  unname(cluster[match(polygon_state, keys)])
}

# ---- K-means on the full data set ----

data = df[, c("income", "elec")]
datalog = df[, c("incomelog", "elec")]

k = elbow_wss(data)
klog = elbow_wss(datalog)

plot_kmeans(data, k, log = FALSE)
# NOTE(review): this clusters the raw `data` with the k chosen on `datalog`
# and only the axes are logarithmic - confirm that this is intended.
plot_kmeans(data, klog, log = TRUE)

# ---- Clusters on a U.S. map ----

res = kmeans(data, centers = k)
polygon_names = map("state", plot = FALSE, namesonly = TRUE)
map_color = cluster_map_colors(res$cluster, polygon_names)
map("state", fill = TRUE, col = map_color)

# ---- Remove `elec` outliers (1.5 * IQR rule) and repeat ----

Q1 = quantile(df$elec, 0.25)
Q3 = quantile(df$elec, 0.75)
# Named so that base::IQR, base::min and base::max are not shadowed.
elec_iqr = Q3 - Q1
lower_fence = Q1 - 1.5 * elec_iqr
upper_fence = Q3 + 1.5 * elec_iqr
df = subset(df, elec > lower_fence & elec < upper_fence)

data = df[, c("income", "elec")]
datalog = df[, c("incomelog", "elec")]

k = elbow_wss(data)
klog = elbow_wss(datalog)

plot_kmeans(data, k, log = FALSE)
# NOTE(review): same caveat as above - raw data, k taken from `datalog`.
plot_kmeans(data, klog, log = TRUE)

# ---- Hierarchical clustering ----

# Use the same two features as k-means; the original passed all of `df`,
# which also counted log10(income), a derived duplicate of `income`.
distance = dist(data, method = "euclidean")
clust = hclust(distance, method = "complete")
print(ggdendrogram(clust))
print(cutree(clust, k = k))