feat(ds): r4
This commit is contained in:
49
ds/25-1/r/2.R
Normal file
49
ds/25-1/r/2.R
Normal file
@ -0,0 +1,49 @@
|
||||
# Read the pipe-delimited zip-code / income table and give its two columns
# readable names.
df <- read.csv("./zipIncome.txt", sep = "|")
names(df) <- c("zipCode", "income")

# First look at the raw data; the mean is taken before any cleaning.
summary(df)
mean(df$income)

# Drop every row whose income (second column) is missing. The guard keeps the
# data frame untouched when there is nothing to remove.
income_missing <- is.na(df[, 2])
if (any(income_missing)) {
  df <- df[!income_missing, ]
}

# Location statistics on the cleaned data.
mean(df$income)
median(df$income, na.rm = TRUE)
|
||||
|
||||
# Raw scatter of income against zip code, before any transformation.
plot(df$income, df$zipCode, xlab = "income", ylab = "zipCode")

# Add a log10 version of income and inspect its distribution.
df$incomelog <- log10(df$income)
hist(df$incomelog, breaks = 80)

# Reference points on the log10 scale: the two cut-offs used below and the
# median of the log incomes (stored under the name avg_incomelog).
min_incomelog <- log10(7e3)
print(min_incomelog)
max_incomelog <- log10(2e5)
print(max_incomelog)
avg_incomelog <- median(df$incomelog)
print(avg_incomelog)

# Keep only rows whose income lies strictly between 7e3 and 2e5, then look at
# the trimmed distribution again.
df <- subset(df, income > 7e3 & income < 2e5)
hist(df$incomelog, breaks = 80)
summary(df)

# Log income per zip code as a base-graphics box plot.
boxplot(
  incomelog ~ zipCode,
  data = df,
  main = "boxplot",
  xlab = "zipCode",
  ylab = "incomelog"
)
|
||||
|
||||
library(ggplot2)

# Jittered scatter of income per zip code with a box plot on top, drawn on a
# log10 y axis.
#
# zipCode is mapped through factor(): geom_boxplot() needs a discrete x (or an
# explicit group) to draw one box per zip code. With a numeric x and no group
# it collapses all rows into a single box and warns about a continuous x
# aesthetic. (read.csv() presumably parsed zipCode as a number; factor() is
# harmless if it is already discrete.)
# The axis and legend titles are set explicitly so they still read "zipCode".
ggplot(df, aes(x = factor(zipCode), y = income, color = factor(zipCode))) +
  geom_point(
    position = position_jitter(width = 0.2),
    alpha = 0.2
  ) +
  geom_boxplot(
    alpha = 0.5,
    outlier.shape = NA, # outliers are already visible as jittered points
    width = 0.6,
    fill = "white",
    color = "black"
  ) +
  scale_y_log10(
    breaks = c(1e4, 25e3, 5e4, 1e5, 2e5, 5e5)
  ) +
  labs(
    title = "Распределение доходов по почтовым индексам",
    subtitle = "Scatter plot jitter",
    x = "zipCode",
    color = "zipCode"
  ) +
  theme_minimal()
|
||||
|
||||
101
ds/25-1/r/4.R
Normal file
101
ds/25-1/r/4.R
Normal file
@ -0,0 +1,101 @@
|
||||
# Load the income / electricity data set shipped next to this script.
#
# The path is relative on purpose: run the script from its own directory (for
# example by opening the r2.Rproj project that lives beside it). A hard-coded
# setwd() to one machine's home directory would make the script fail
# everywhere else and would change the caller's working directory.
load("./income_elec_state.Rdata")

# Work on a copy under a short name and drop the loaded object.
df <- income_elec_state
rm(income_elec_state)

# log10 of income, used for clustering on a log scale.
df$incomelog <- log10(df$income)
|
||||
|
||||
#' Choose a number of clusters with the elbow heuristic.
#'
#' Runs k-means on the first two columns of `df` for k = 1..max_k, plots the
#' total within-cluster sum of squares (WSS) against k, and returns the k at
#' which the curve flattens most sharply.
#'
#' @param df Data frame or matrix; only its first two columns are clustered.
#' @param max_k Largest k to try. Must be at least 3 so that two consecutive
#'   WSS drops can be compared.
#' @param nstart Number of random starts per k-means run. More than one start
#'   makes the WSS curve far less likely to be non-monotone, which would make
#'   the ratio below meaningless.
#' @return An integer k in 2..(max_k - 1).
elbow_wss <- function(df, max_k = 10, nstart = 25) {
  stopifnot(max_k >= 3)

  features <- df[, 1:2]
  wss <- vapply(
    seq_len(max_k),
    function(k) kmeans(features, centers = k, nstart = nstart)$tot.withinss,
    numeric(1)
  )

  plot(seq_len(max_k), wss, type = "b", xlab = "k", ylab = "wss")

  # wss_diff[i] is the change in WSS when going from i to i + 1 clusters.
  wss_diff <- diff(wss)

  # wss_ratio[i] = (drop after k = i + 1) / (drop into k = i + 1). It is
  # smallest where a large improvement is followed by a negligible one, i.e.
  # at the elbow k = i + 1, hence the + 1 on the index.
  wss_ratio <- wss_diff[-1] / wss_diff[-length(wss_diff)]
  which.min(wss_ratio) + 1L
}
|
||||
|
||||
library(ggplot2)
|
||||
|
||||
#' Cluster `data` with k-means and draw the labelled result.
#'
#' Points are coloured by cluster and labelled with their row names; cluster
#' centres are drawn as large triangles.
#'
#' @param data Data frame with columns `income` and `elec`. All of its columns
#'   are passed to `kmeans()`; its row names become the point labels.
#' @param k Number of clusters passed to `kmeans()`.
#' @param log If `TRUE`, both axes are drawn on a log10 scale.
#' @return The ggplot object, invisibly; the plot is printed as a side effect.
plot_kmeans <- function(data, k, log = FALSE) {
  res <- kmeans(data, centers = k)

  centers <- as.data.frame(res$centers)
  centers$cluster <- factor(seq_len(k))

  data$cluster <- factor(res$cluster, levels = seq_len(k))
  # Take the labels from the argument itself, not from a global data frame, so
  # the labels always line up with the rows being plotted.
  data$state <- rownames(data)

  plt <- ggplot() +
    geom_point(
      data = data,
      aes(x = income, y = elec, color = cluster)
    ) +
    geom_text(
      data = data,
      aes(x = income, y = elec, label = state),
      vjust = 1.5,
      size = 2
    ) +
    # Cluster centres: large triangles in the colour of their cluster.
    geom_point(
      data = centers,
      aes(x = income, y = elec, color = cluster),
      shape = 17,
      size = 5
    ) +
    theme_minimal()

  if (log) {
    plt <- plt + scale_x_log10() + scale_y_log10()
  }

  print(plt)
  invisible(plt)
}
|
||||
|
||||
# Two feature sets: raw income and log10 income, each paired with electricity
# use.
data <- df[c("income", "elec")]
datalog <- df[c("incomelog", "elec")]

# Pick a cluster count for each feature set.
k <- elbow_wss(data)
klog <- elbow_wss(datalog)

# Plot the clustering on linear axes, then on log10 axes.
# NOTE(review): the second call clusters the raw `data` with `klog` and only
# switches the axes to log10; confirm that clustering `datalog` was not meant.
plot_kmeans(data, k, log = FALSE)
plot_kmeans(data, klog, log = TRUE)
|
||||
|
||||
|
||||
library(maps)

res <- kmeans(data, centers = k)

# map("state", fill = TRUE) matches colours one-to-one with the polygons it
# draws, and a state may consist of several polygons (e.g.
# "new york:manhattan"). Colours therefore have to be looked up per polygon;
# sorting the clusters by row name does not line them up with the polygons.
poly_names <- map("state", namesonly = TRUE, plot = FALSE)
poly_state <- sub(":.*$", "", poly_names)

# Row names are presumably postal abbreviations; full state names are also
# accepted. Both are normalised to the lower-case names used by the map
# database.
abbreviations <- c(state.abb, "DC")
full_names <- tolower(c(state.name, "District of Columbia"))
row_key <- tolower(names(res$cluster))
abb_pos <- match(toupper(names(res$cluster)), abbreviations)
is_abb <- !is.na(abb_pos)
row_key[is_abb] <- full_names[abb_pos[is_abb]]

# Polygons without a matching row get NA and are left unfilled.
map_color <- unname(res$cluster[match(poly_state, row_key)])
map("state", fill = TRUE, col = map_color)
|
||||
|
||||
# Remove electricity-use outliers with the 1.5 * IQR rule: keep only rows whose
# `elec` lies strictly between the lower and upper fence.
#
# The intermediate values get descriptive names instead of `IQR`, `min` and
# `max`, which would mask the built-in functions of the same names in the
# global environment.
elec_quartiles <- quantile(df$elec, c(0.25, 0.75))
elec_q1 <- elec_quartiles[[1]]
elec_q3 <- elec_quartiles[[2]]
elec_iqr <- elec_q3 - elec_q1

lower_fence <- elec_q1 - 1.5 * elec_iqr
upper_fence <- elec_q3 + 1.5 * elec_iqr

df <- subset(df, elec > lower_fence & elec < upper_fence)
|
||||
|
||||
# Rebuild both feature sets from the current `df` and repeat the clustering.
data <- df[c("income", "elec")]
datalog <- df[c("incomelog", "elec")]

# Re-estimate the cluster count for each feature set.
k <- elbow_wss(data)
klog <- elbow_wss(datalog)

# Same pair of plots as before: linear axes with k, log10 axes with klog.
plot_kmeans(data, k, log = FALSE)
plot_kmeans(data, klog, log = TRUE)
|
||||
|
||||
library(ggdendro)

# Complete-linkage hierarchical clustering on Euclidean distances.
# NOTE(review): dist() is given every column of `df`, including both `income`
# and `incomelog`; confirm that this is intended rather than `data`.
clust <- hclust(dist(df, method = "euclidean"), method = "complete")

# Draw the dendrogram, then cut the tree into k groups.
plot(ggdendrogram(clust))
cutree(clust, k = k)
|
||||
BIN
ds/25-1/r/income_elec_state.Rdata
Normal file
BIN
ds/25-1/r/income_elec_state.Rdata
Normal file
Binary file not shown.
13
ds/25-1/r/r2.Rproj
Normal file
13
ds/25-1/r/r2.Rproj
Normal file
@ -0,0 +1,13 @@
|
||||
Version: 1.0
|
||||
|
||||
RestoreWorkspace: Default
|
||||
SaveWorkspace: Default
|
||||
AlwaysSaveHistory: Default
|
||||
|
||||
EnableCodeIndexing: Yes
|
||||
UseSpacesForTab: Yes
|
||||
NumSpacesForTab: 2
|
||||
Encoding: UTF-8
|
||||
|
||||
RnwWeave: Sweave
|
||||
LaTeX: pdfLaTeX
|
||||
32040
ds/25-1/r/zipIncome.txt
Normal file
32040
ds/25-1/r/zipIncome.txt
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user