From 98e89cfea6eff132ad3d8add214b7b130eeba232 Mon Sep 17 00:00:00 2001 From: SEK1RO Date: Tue, 2 Dec 2025 14:30:05 +0300 Subject: [PATCH] feat(ds): r4 --- ds/25-1/{r2 => r}/2.R | 0 ds/25-1/r/4.R | 101 ++++++++++++++++++++++++++++++ ds/25-1/r/income_elec_state.Rdata | Bin 0 -> 1155 bytes ds/25-1/{r2 => r}/r2.Rproj | 0 ds/25-1/{r2 => r}/zipIncome.txt | 0 5 files changed, 101 insertions(+) rename ds/25-1/{r2 => r}/2.R (100%) create mode 100644 ds/25-1/r/4.R create mode 100644 ds/25-1/r/income_elec_state.Rdata rename ds/25-1/{r2 => r}/r2.Rproj (100%) rename ds/25-1/{r2 => r}/zipIncome.txt (100%) diff --git a/ds/25-1/r2/2.R b/ds/25-1/r/2.R similarity index 100% rename from ds/25-1/r2/2.R rename to ds/25-1/r/2.R diff --git a/ds/25-1/r/4.R b/ds/25-1/r/4.R new file mode 100644 index 0000000..e5bf2db --- /dev/null +++ b/ds/25-1/r/4.R @@ -0,0 +1,101 @@ +setwd('/home/sek1ro/git/public/lab/ds/25-1/r') +load("./income_elec_state.Rdata") +df = income_elec_state +df$incomelog = log10(df$income) +remove(income_elec_state) + +elbow_wss = function(df) { + max_k = 10 + wss = numeric(max_k) + + for (i in 1:max_k) { + res = kmeans(df[,1:2], centers = i) + wss[i] = res$tot.withinss + } + + plot(1:max_k, wss, type="b") + + wss_diff = diff(wss) + wss_ratio = wss_diff[-1] / wss_diff[-length(wss_diff)] + return(which.min(wss_ratio)) +} + +library(ggplot2) + +plot_kmeans = function(data, k, log) { + res = kmeans(data, centers = k) + centers = as.data.frame(res$centers) + centers$cluster = as.factor(1:k) + data$cluster = as.factor(res$cluster) + data$state = rownames(df) + + plt = ggplot() + + geom_point( + data = data, + aes( + x = income, + y = elec, + color = cluster + ) + ) + + geom_text( + data = data, + vjust = 1.5, + size = 2, + aes( + x = income, + y = elec, + label = state + ) + ) + + geom_point( + data = centers, + shape = 17, + size = 5, + aes( + x = income, + y = elec, + color = cluster, + ) + ) + + theme_minimal() + + if (log) { + plt = plt + scale_x_log10() + scale_y_log10() + } + + print(plt) +} + +data = df[,c("income", "elec")] +datalog = df[,c("incomelog", "elec")] +k = elbow_wss(data) +klog = elbow_wss(datalog) +plot_kmeans(data, k, log=FALSE) +plot_kmeans(data, klog, log=TRUE) + + +library(maps) +res = kmeans(data, centers = k) +map_color = res$cluster[order(names(res$cluster))] +map("state", fill = TRUE, col = map_color) + +Q1 = quantile(df$elec, 0.25) +Q3 = quantile(df$elec, 0.75) +IQR = Q3 - Q1 +min = Q1 - 1.5 * IQR +max = Q3 + 1.5 * IQR +df = subset(df, df$elec > min & df$elec < max) + +data = df[,c("income", "elec")] +datalog = df[,c("incomelog", "elec")] +k = elbow_wss(data) +klog = elbow_wss(datalog) +plot_kmeans(data, k, log=FALSE) +plot_kmeans(data, klog, log=TRUE) + +library(ggdendro) +distance = dist(df, method = "euclidean") +clust = hclust(distance, method = "complete") +plot(ggdendrogram(clust)) +cutree(clust, k = k) diff --git a/ds/25-1/r/income_elec_state.Rdata b/ds/25-1/r/income_elec_state.Rdata new file mode 100644 index 0000000000000000000000000000000000000000..c3d710a4e58e3c67bf11001a9f2ada0540501fc6 GIT binary patch literal 1155 zcmZ9KUucbS7{@>7yzizhLTj`d}sSTtzCTI@ArM4_j#W0^ZUK2SVLKHLopC0 zJXudq(hutq!ly_)b!}^|ttH=-Z_ekMIy$qRdGMzFFMvtna=11KM}NV~jf4toptl=V z?Sa!}gyzhKD_w-VyU@J|u5O3h^Py+Aw8}qa;mLmJ`Uo#d;OjGZzXKkvhk-tL{0C0$ zgKw{(zl6}lL-6GY+_?dl_Y#`-8D4LKH#_0p0{A{4Js0l((2PbnmV?3NaCotL{M4-J zg#6=h?m7%@g$rBYY;a@&O? zw>|pi4`2HDvRrev!*t|^3mSmbl_C7)>Pb7#Hoa3 zYf>&2#7tE>6}O70sl*DGO;$TqpLQx^+1jX!MG{V>8=OimacY@$j#|Hr#WLnTsCN}h ztDLH~&OwF6OeJljxK#w^Dli98b6;mFWpiuHOJKVt?VPLa{Ngq(VzHRb$e8lbNs fo@3^!%CZ&aAY$&5<}Ypzf~4zG8`EF+cNqTwF*cb? literal 0 HcmV?d00001 diff --git a/ds/25-1/r2/r2.Rproj b/ds/25-1/r/r2.Rproj similarity index 100% rename from ds/25-1/r2/r2.Rproj rename to ds/25-1/r/r2.Rproj diff --git a/ds/25-1/r2/zipIncome.txt b/ds/25-1/r/zipIncome.txt similarity index 100% rename from ds/25-1/r2/zipIncome.txt rename to ds/25-1/r/zipIncome.txt