# ds/25-1/r/4.R
#
# Clusters the rows of `income_elec_state` on their `income` and `elec`
# columns with k-means, draws the clusters on a US state map, removes `elec`
# outliers, repeats the k-means step and finishes with a hierarchical
# clustering.
#
# Provenance: recovered from the git diff that adds ds/25-1/r/4.R. The same
# commit renames ds/25-1/r2/{2.R, r2.Rproj, zipIncome.txt} into ds/25-1/r/
# and adds the binary ds/25-1/r/income_elec_state.Rdata.

library(ggplot2)
library(maps)
library(ggdendro)

# k-means starts from random centres; fix the seed so the chosen k and the
# cluster numbering are the same on every run.
set.seed(1)

# The data file is resolved against the current working directory instead of
# calling setwd() on one machine's absolute path.
data_file <- "income_elec_state.Rdata"
if (!file.exists(data_file)) {
  stop(
    "Cannot find ", data_file, " in ", getwd(),
    "; run this script from ds/25-1/r.",
    call. = FALSE
  )
}
load(data_file)
df <- income_elec_state
df$incomelog <- log10(df$income)
remove(income_elec_state)

#' Choose a cluster count with a within-cluster sum-of-squares elbow rule.
#'
#' Runs k-means on the first two columns of `df` for every k in 1..max_k,
#' plots the total within-cluster sum of squares (WSS) against k, and returns
#' the k at which the curve flattens the most.
#'
#' @param df Data frame or matrix; only its first two columns are clustered.
#' @param max_k Largest k to try. Needs at least 3 values of k to form one
#'   ratio, and cannot exceed the number of rows.
#' @param nstart Number of random starts per k-means fit. More than one start
#'   keeps the WSS curve from rising on an unlucky initialisation, which
#'   would make the ratio below meaningless.
#' @return A single integer k between 2 and `max_k - 1`.
elbow_wss <- function(df, max_k = 10L, nstart = 25L) {
  features <- df[, 1:2]
  stopifnot(max_k >= 3L, max_k <= nrow(features))

  wss <- vapply(
    seq_len(max_k),
    function(k) kmeans(features, centers = k, nstart = nstart)$tot.withinss,
    numeric(1)
  )

  plot(seq_len(max_k), wss, type = "b")

  # drops[j] is the WSS change going from k = j to k = j + 1, so
  # drop_ratio[j] compares the change after k = j + 1 with the change before
  # it. The smallest ratio therefore marks an elbow at k = j + 1.
  drops <- diff(wss)
  drop_ratio <- drops[-1] / drops[-length(drops)]
  which.min(drop_ratio) + 1L
}

#' Scatter-plot a k-means clustering of `data`.
#'
#' Fits k-means with `k` centres on `data`, then draws the observations
#' coloured by cluster, labels each point with its row name and marks the
#' cluster centres with large triangles.
#'
#' @param data Data frame with numeric columns `income` and `elec`; its row
#'   names are used as point labels.
#' @param k Number of clusters.
#' @param log If `TRUE`, both axes are drawn on a log10 scale. The clustering
#'   itself is always computed on the untransformed values.
#' @return The ggplot object, invisibly; the plot is printed as a side effect.
plot_kmeans <- function(data, k, log) {
  fit <- kmeans(data, centers = k)

  centers <- as.data.frame(fit$centers)
  centers$cluster <- factor(seq_len(k))

  data$cluster <- factor(fit$cluster)
  # Label from the argument itself rather than from a global data frame.
  data$state <- rownames(data)

  plt <- ggplot() +
    geom_point(
      data = data,
      aes(x = income, y = elec, color = cluster)
    ) +
    geom_text(
      data = data,
      aes(x = income, y = elec, label = state),
      vjust = 1.5,
      size = 2
    ) +
    geom_point(
      data = centers,
      aes(x = income, y = elec, color = cluster),
      shape = 17,
      size = 5
    ) +
    theme_minimal()

  if (log) {
    plt <- plt + scale_x_log10() + scale_y_log10()
  }

  print(plt)
  invisible(plt)
}

#' Translate state labels to the region names used by `map("state")`.
#'
#' @param labels Character vector of state labels.
#' @return Lower-case full state names; labels that are not recognised as an
#'   abbreviation are only lower-cased.
state_region_name <- function(labels) {
  # NOTE(review): assumes the row names are either two-letter postal
  # abbreviations or full state names - confirm against the .Rdata file.
  region <- tolower(labels)
  abb_idx <- match(toupper(labels), state.abb)
  known <- !is.na(abb_idx)
  region[known] <- tolower(state.name[abb_idx[known]])
  # `state.abb` covers the 50 states only.
  region[toupper(labels) == "DC"] <- "district of columbia"
  region
}

# K-means on the full data ----

data <- df[, c("income", "elec")]
datalog <- df[, c("incomelog", "elec")]
k <- elbow_wss(data)
klog <- elbow_wss(datalog)
plot_kmeans(data, k, log = FALSE)
# NOTE(review): klog is chosen on (incomelog, elec) but the clustering below
# runs on the raw columns and only the axes are log-scaled - confirm intent.
plot_kmeans(data, klog, log = TRUE)

# Cluster map ----

res <- kmeans(data, centers = k)
# With fill = TRUE, map() assigns `col` to polygons one by one and recycles
# it, and some states consist of several polygons ("state:part"). Look each
# polygon's state up in the clustering so every polygon gets its own state's
# cluster id; polygons without a match stay unfilled (NA).
polygon_names <- map("state", fill = TRUE, plot = FALSE, namesonly = TRUE)
polygon_state <- sub(":.*$", "", polygon_names)
cluster_region <- state_region_name(names(res$cluster))
map_color <- unname(res$cluster[match(polygon_state, cluster_region)])
map("state", fill = TRUE, col = map_color)

# Remove `elec` outliers (1.5 * IQR fences) and repeat ----

# Local names avoid masking base::min, base::max and stats::IQR.
q1 <- quantile(df$elec, 0.25)
q3 <- quantile(df$elec, 0.75)
elec_iqr <- q3 - q1
lower_fence <- q1 - 1.5 * elec_iqr
upper_fence <- q3 + 1.5 * elec_iqr
df <- subset(df, elec > lower_fence & elec < upper_fence)

data <- df[, c("income", "elec")]
datalog <- df[, c("incomelog", "elec")]
k <- elbow_wss(data)
klog <- elbow_wss(datalog)
plot_kmeans(data, k, log = FALSE)
plot_kmeans(data, klog, log = TRUE)

# Hierarchical clustering ----

# Use the same two features as k-means; the full `df` also carries
# `incomelog`, which would count income twice in the distance.
distance <- dist(data, method = "euclidean")
clust <- hclust(distance, method = "complete")
plot(ggdendrogram(clust))
cutree(clust, k = k)