ds: r -> rmd
This commit is contained in:
61
ds/25-1/r/2.rmd
Normal file
61
ds/25-1/r/2.rmd
Normal file
@ -0,0 +1,61 @@
|
||||
---
title: "Lab2: Introduction to R, exploring the box-plot"
author: "Vladislav Litvinov <vlad@sek1ro>"
output:
  pdf_document:
    # `toc_float` is an html_document-only option and is rejected by
    # pdf_document; a plain table of contents is the PDF equivalent.
    toc: true
---
|
||||
# Data preparation
|
||||
```{r}
|
||||
# Load the pipe-delimited ZIP/income table and give the columns short names.
df <- read.csv("./zipIncome.txt", sep = "|")
colnames(df) <- c("zipCode", "income")

# First look at the raw data. mean() returns NA here if any income is
# missing, which is what motivates the cleaning step below.
summary(df)
mean(df$income)

# Drop rows with a missing income. A logical mask keeps every row when there
# are no NAs, so no guard around the subsetting is required.
df <- df[!is.na(df$income), ]

# Central tendency of the cleaned income column.
mean(df$income)
median(df$income)
|
||||
```
|
||||
# Histograms and box-plot "whiskers"
|
||||
```{r}
|
||||
# Raw scatter of income against ZIP code; axis labels match the plotted data.
plot(x = df$zipCode, y = df$income, xlab = "zipCode", ylab = "income")

# Income is inspected on a log10 scale.
df$incomelog <- log10(df$income)
hist(df$incomelog, breaks = 80)

# Income cut-offs used for the filtering below, defined once so the printed
# log10 bounds and the subset condition cannot drift apart.
min_income <- 7e3
max_income <- 2e5
print(min_incomelog <- log10(min_income))
print(max_incomelog <- log10(max_income))
# NOTE: despite its name, avg_incomelog holds the median of the log10 income.
print(avg_incomelog <- median(df$incomelog))

# Keep only rows strictly inside the (min_income, max_income) range.
df <- subset(df, income > min_income & income < max_income)
hist(df$incomelog, breaks = 80)
summary(df)

# One box per ZIP code on the log10 income scale.
boxplot(
  incomelog ~ zipCode,
  data = df,
  main = "boxplot",
  xlab = "zipCode",
  ylab = "incomelog"
)
|
||||
|
||||
library(ggplot2)

# Jittered points with one box plot per ZIP code, income on a log10 axis.
# zipCode is wrapped in factor() so ggplot2 treats it as a discrete grouping
# variable: with a continuous x, geom_boxplot() does not split the data into
# per-ZIP boxes. (Presumably read.csv parsed zipCode as a number; factor() is
# harmless if it is already character.)
ggplot(
  df,
  aes(x = factor(zipCode), y = income, color = factor(zipCode))
) +
  geom_point(
    position = position_jitter(width = 0.2),
    alpha = 0.2
  ) +
  # Boxes are drawn in black on white over the points; outliers are hidden
  # because the jittered points already show every observation.
  geom_boxplot(
    alpha = 0.5,
    outlier.shape = NA,
    width = 0.6,
    fill = "white",
    color = "black"
  ) +
  scale_y_log10(
    breaks = c(1e4, 25e3, 5e4, 1e5, 2e5, 5e5)
  ) +
  # Explicit x/color titles so the axis and legend read "zipCode" rather
  # than the "factor(zipCode)" expression.
  labs(
    title = "Income distribution by ZIP codes",
    subtitle = "Scatter plot jitter",
    x = "zipCode",
    color = "zipCode"
  ) +
  theme_minimal()
|
||||
|
||||
```
|
||||
Reference in New Issue
Block a user