# Exploratory analysis of income by zip code.
#
# Reads a pipe-separated file, drops rows with missing income, inspects the
# distribution on a log10 scale, trims it to the (7e3, 2e5) income window and
# draws per-zip-code box plots with base graphics and ggplot2.

library(ggplot2)

# Load data ----

df <- read.csv("./zipIncome.txt", sep = "|")
colnames(df) <- c("zipCode", "income")

summary(df)

# Mean before NA removal: evaluates to NA if any income value is missing.
mean(df$income)

# Drop rows with a missing income. A logical mask is also safe when nothing is
# missing (negative indexing with an empty index would drop every row).
df <- df[!is.na(df$income), ]

mean(df$income)
median(df$income, na.rm = TRUE)

# Raw scatter ----

# x is the zip code and y is the income, so the labels must match that order.
plot(x = df$zipCode, y = df$income, xlab = "zipCode", ylab = "income")

# Log-scale distribution ----

df$incomelog <- log10(df$income)
hist(df$incomelog, breaks = 80)

# Bounds of the income window on the log10 scale, printed for reference.
print(min_incomelog <- log10(7e3))
print(max_incomelog <- log10(2e5))
# Despite the name, this is the median of the log incomes, not the mean.
print(avg_incomelog <- median(df$incomelog))

# Keep only incomes strictly inside (7e3, 2e5).
df <- subset(df, 7e3 < income & income < 2e5)

hist(df$incomelog, breaks = 80)
summary(df)

# Box plots per zip code ----

boxplot(
  incomelog ~ zipCode,
  data = df,
  main = "boxplot",
  xlab = "zipCode",
  ylab = "incomelog"
)

ggplot(df, aes(x = zipCode, y = income, color = zipCode)) +
  geom_point(
    position = position_jitter(width = 0.2),
    alpha = 0.2
  ) +
  geom_boxplot(
    # zipCode is a continuous x here; without an explicit group ggplot2 would
    # draw a single box over all data instead of one box per zip code.
    aes(group = zipCode),
    alpha = 0.5,
    outlier.shape = NA,
    width = 0.6,
    fill = "white",
    color = "black"
  ) +
  scale_y_log10(
    breaks = c(1e4, 25e3, 5e4, 1e5, 2e5, 5e5)
  ) +
  labs(
    title = "Распределение доходов по почтовым индексам",
    subtitle = "Scatter plot jitter"
  ) +
  theme_minimal()