# Load the pipe-delimited income file and give its two columns working names.
df <- read.csv("./zipIncome.txt", sep = "|")
colnames(df) <- c("zipCode", "income")

# First look at the raw data. This mean is NA if any income value is missing,
# which is what motivates the clean-up step below.
summary(df)
mean(df$income)

# Drop rows whose income is missing. A logical mask keeps every row when
# nothing is missing, so it needs no guard (negative indexing with an empty
# which() result would drop all rows instead).
df <- df[!is.na(df$income), ]

# Central tendency of the cleaned incomes.
mean(df$income)
median(df$income, na.rm = TRUE)

# Raw scatter of income against zip code. The axis labels now match the
# plotted variables (zipCode on x, income on y).
plot(x = df$zipCode, y = df$income, xlab = "zipCode", ylab = "income")

# Add a log10-transformed income column and look at its distribution.
df$incomelog <- log10(df$income)
hist(df$incomelog, breaks = 80)

# Reference values on the log10 scale: the 7e3 and 2e5 income bounds, and the
# centre of the distribution.
min_incomelog <- log10(7e3)
max_incomelog <- log10(2e5)
# Note: despite the "avg" name this is the median of the log10 incomes.
avg_incomelog <- median(df$incomelog)
print(min_incomelog)
print(max_incomelog)
print(avg_incomelog)

# Keep only rows whose income lies strictly between 7e3 and 2e5.
df <- subset(df, income > 7e3 & income < 2e5)

# Distribution and summary of the retained rows.
hist(df$incomelog, breaks = 80)
summary(df)

# Log10 income grouped by zip code.
boxplot(
  incomelog ~ zipCode,
  data = df,
  main = "boxplot",
  xlab = "zipCode",
  ylab = "incomelog"
)

library(ggplot2)

# Jittered points with an overlaid box plot per zip code, income on a log10
# axis. zipCode is mapped through factor() so that it is treated as a discrete
# grouping variable: one box and one colour per zip code, rather than a single
# box over a continuous x axis.
ggplot(df, aes(x = factor(zipCode), y = income, color = factor(zipCode))) +
  geom_point(
    position = position_jitter(width = 0.2),
    alpha = 0.2
  ) +
  # Boxes are drawn in black on white; outliers are hidden because the
  # jittered points already show every observation.
  geom_boxplot(
    alpha = 0.5,
    outlier.shape = NA,
    width = 0.6,
    fill = "white",
    color = "black"
  ) +
  scale_y_log10(
    breaks = c(1e4, 25e3, 5e4, 1e5, 2e5, 5e5)
  ) +
  # Explicit x/colour labels keep the axis and legend titled "zipCode"
  # instead of "factor(zipCode)".
  labs(
    title = "Распределение доходов по почтовым индексам",
    subtitle = "Scatter plot jitter",
    x = "zipCode",
    color = "zipCode"
  ) +
  theme_minimal()