feat(ds): r4

2025-12-02 14:30:05 +03:00
parent 07a6cf6184
commit 98e89cfea6
5 changed files with 101 additions and 0 deletions
--- a/ds/25-1/r/2.R
+++ b/ds/25-1/r/2.R
@ -0,0 +1,49 @@
+# Exploratory analysis of income by zip code.
+#
+# Reads the pipe-separated file "./zipIncome.txt", drops rows with a missing
+# income, inspects the income distribution on a log10 scale, keeps only
+# incomes strictly between 7e3 and 2e5, and draws per-zip-code plots with
+# base graphics and ggplot2.
+library(ggplot2)
+
+df <- read.csv("./zipIncome.txt", sep = "|")
+colnames(df) <- c("zipCode", "income")
+
+summary(df)
+# Mean before cleaning: evaluates to NA if any income is missing.
+mean(df$income)
+
+# Keep only rows with a known income. A logical mask is safe even when
+# nothing is missing (a negative index built from an empty which() is not).
+df <- df[!is.na(df$income), ]
+
+mean(df$income)
+median(df$income, na.rm = TRUE)
+
+plot(x = df$income, y = df$zipCode, xlab = "income", ylab = "zipCode")
+df$incomelog <- log10(df$income)
+hist(df$incomelog, breaks = 80)
+
+# Income bounds used for trimming, shown on the same log10 scale as the
+# histogram above.
+income_min <- 7e3
+income_max <- 2e5
+print(min_incomelog <- log10(income_min))
+print(max_incomelog <- log10(income_max))
+# Despite its name, this is the median of the log-income.
+print(avg_incomelog <- median(df$incomelog))
+
+df <- subset(df, income_min < income & income < income_max)
+hist(df$incomelog, breaks = 80)
+summary(df)
+
+boxplot(
+  incomelog ~ zipCode,
+  data = df,
+  main = "boxplot",
+  xlab = "zipCode",
+  ylab = "incomelog"
+)
+
+# ggplot2 only groups box plots by a discrete x. Left numeric, zipCode
+# would yield a single box for the whole data set, so make it a factor.
+df$zipCode <- factor(df$zipCode)
+
+ggplot(df, aes(x = zipCode, y = income, color = zipCode)) +
+  geom_point(
+    position = position_jitter(width = 0.2),
+    alpha = 0.2
+  ) +
+  geom_boxplot(
+    alpha = 0.5,
+    outlier.shape = NA,
+    width = 0.6,
+    fill = "white",
+    color = "black"
+  ) +
+  scale_y_log10(
+    breaks = c(1e4, 25e3, 5e4, 1e5, 2e5, 5e5)
+  ) +
+  labs(
+    title = "Распределение доходов по почтовым индексам",
+    subtitle = "Scatter plot jitter"
+  ) +
+  theme_minimal()
+  
--- a/ds/25-1/r/4.R
+++ b/ds/25-1/r/4.R
@ -0,0 +1,101 @@
+# Load the data set and add a log10 income column.
+# The path is relative: run the script from the directory that holds the
+# .Rdata file instead of forcing a machine-specific working directory.
+# The file is expected to define `income_elec_state`.
+load("./income_elec_state.Rdata")
+df <- income_elec_state
+df$incomelog <- log10(df$income)
+rm(income_elec_state)
+
+#' Pick a number of clusters with the elbow heuristic.
+#'
+#' Runs k-means on the first two columns of `df` for k = 1..max_k, plots the
+#' total within-cluster sum of squares (WSS) against k, and returns the k
+#' after which the decrease in WSS slows down the most.
+#'
+#' @param df Data frame or matrix; only its first two columns are clustered.
+#' @param max_k Largest k to try. Must be at least 3 so that one ratio of
+#'   successive WSS drops exists.
+#' @param nstart Number of random starts passed to `kmeans()` for each k.
+#' @return The selected k, an integer between 2 and `max_k - 1`.
+elbow_wss <- function(df, max_k = 10, nstart = 25) {
+  stopifnot(max_k >= 3)
+  features <- df[, 1:2]
+
+  wss <- vapply(
+    seq_len(max_k),
+    function(k) kmeans(features, centers = k, nstart = nstart)$tot.withinss,
+    numeric(1)
+  )
+
+  plot(seq_len(max_k), wss, type = "b")
+
+  # wss_diff[j] is the WSS change from k = j to k = j + 1.
+  wss_diff <- diff(wss)
+  # wss_ratio[j] compares the change after k = j + 1 with the change before
+  # it; the smallest ratio marks the sharpest bend of the curve.
+  wss_ratio <- wss_diff[-1] / wss_diff[-length(wss_diff)]
+  # Entry j describes the bend at k = j + 1, hence the shift by one.
+  which.min(wss_ratio) + 1L
+}
+
+library(ggplot2)
+
+#' Cluster `data` with k-means and print a labelled scatter plot.
+#'
+#' Points are drawn at (income, elec) and coloured by cluster, each point is
+#' labelled with its row name, and the cluster centres are overlaid as large
+#' triangles. Called for its side effect of printing the plot.
+#'
+#' @param data Data frame passed as-is to `kmeans()`; the plot reads its
+#'   `income` and `elec` columns and uses its row names as labels.
+#' @param k Number of clusters.
+#' @param log If `TRUE`, both axes use a log10 scale.
+plot_kmeans <- function(data, k, log = FALSE) {
+  # Cluster before any helper columns are added to `data`.
+  fit <- kmeans(data, centers = k)
+
+  centers <- as.data.frame(fit$centers)
+  centers$cluster <- factor(seq_len(k))
+
+  data$cluster <- factor(fit$cluster)
+  # Label points from the rows actually plotted, not from a global object.
+  data$state <- rownames(data)
+
+  plt <- ggplot() +
+    geom_point(
+      data = data,
+      aes(x = income, y = elec, color = cluster)
+    ) +
+    geom_text(
+      data = data,
+      aes(x = income, y = elec, label = state),
+      vjust = 1.5,
+      size = 2
+    ) +
+    geom_point(
+      data = centers,
+      aes(x = income, y = elec, color = cluster),
+      shape = 17,
+      size = 5
+    ) +
+    theme_minimal()
+
+  if (log) {
+    plt <- plt + scale_x_log10() + scale_y_log10()
+  }
+
+  print(plt)
+}
+
+# First pass on the full data set: choose k with the elbow heuristic on the
+# raw and on the log-income features, then plot both clusterings.
+data <- df[c("income", "elec")]
+datalog <- df[c("incomelog", "elec")]
+k <- elbow_wss(data)
+klog <- elbow_wss(datalog)
+plot_kmeans(data, k, log = FALSE)
+# The raw-scale `data` is passed here as well; only the axes are log-scaled.
+plot_kmeans(data, klog, log = TRUE)
+
+
+library(maps)
+# Colour the US state map by k-means cluster, ordered by row name.
+# NOTE(review): presumably the polygons of the "state" database do not line
+# up one-to-one with the sorted row names - confirm the colours match states.
+res <- kmeans(data, centers = k)
+cluster_by_state <- res$cluster
+map_color <- cluster_by_state[order(names(cluster_by_state))]
+map("state", fill = TRUE, col = map_color)
+
+# Drop rows whose electricity value lies outside the 1.5 * IQR fences.
+# The helper names avoid shadowing IQR(), min() and max().
+q1 <- quantile(df$elec, 0.25)
+q3 <- quantile(df$elec, 0.75)
+iqr <- q3 - q1
+elec_lower <- q1 - 1.5 * iqr
+elec_upper <- q3 + 1.5 * iqr
+df <- subset(df, elec > elec_lower & elec < elec_upper)
+
+# Second pass: same clustering and plots on the filtered data.
+data <- df[, c("income", "elec")]
+datalog <- df[, c("incomelog", "elec")]
+k <- elbow_wss(data)
+klog <- elbow_wss(datalog)
+plot_kmeans(data, k, log = FALSE)
+plot_kmeans(data, klog, log = TRUE)
+
+library(ggdendro)
+# Complete-linkage hierarchical clustering, cut into the same k groups.
+# NOTE(review): dist() runs on every column of df, so income enters both
+# raw and log-transformed and nothing is scaled - confirm this is intended.
+distance <- dist(df, method = "euclidean")
+clust <- hclust(distance, method = "complete")
+plot(ggdendrogram(clust))
+cutree(clust, k = k)
--- a/ds/25-1/r/income_elec_state.Rdata
+++ b/ds/25-1/r/income_elec_state.Rdata
--- a/ds/25-1/r/r2.Rproj
+++ b/ds/25-1/r/r2.Rproj
@ -0,0 +1,13 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
--- a/ds/25-1/r/zipIncome.txt
+++ b/ds/25-1/r/zipIncome.txt