ds: r6
This commit is contained in:
79
ds/25-1/r/6.R
Normal file
79
ds/25-1/r/6.R
Normal file
@ -0,0 +1,79 @@
|
||||
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')
|
||||
df = read.csv('zeta.csv')
|
||||
head(df)
|
||||
df = df[df$sex == 'F', -which(names(df) %in% c("X", "zcta", "sex"))]
|
||||
df = subset(df, 8 < meaneducation & meaneducation < 18 &
|
||||
10000 < meanhouseholdincome & meanhouseholdincome < 200000 &
|
||||
0 < meanemployment & meanemployment < 3 &
|
||||
20 < meanage & meanage < 60)
|
||||
|
||||
log_income = log10(df$meanhouseholdincome)
|
||||
colnames(df) = c("age", "education", "employment", "householdincome")
|
||||
|
||||
library(ggplot2)
|
||||
ggplot(df, aes(x = age, y = log_income)) +
|
||||
geom_point() +
|
||||
geom_smooth(method = "lm")
|
||||
theme_minimal()
|
||||
|
||||
lmmod = lm(log_income ~ age, data = df)
|
||||
summary(lmmod)
|
||||
|
||||
# xm = sum(xi) / n
|
||||
# sd = sqrt(sum(xi - xm) / (n - 1))
|
||||
# mr = sd / sqrt(n): вариация
|
||||
# t = (xm1 - xm2) / sqrt(mr1 ^ 2 + mr2 ^ 2)
|
||||
# f = (n1 + n2) - 2: степень свободы
|
||||
# p: по таблице: Отличаются с вероятностью ошибки ...
|
||||
|
||||
ggplot(df, aes(x = education, y = log_income)) +
|
||||
geom_point() +
|
||||
geom_smooth(method = "lm")
|
||||
theme_minimal()
|
||||
|
||||
lmmod = lm(log_income ~ education, data = df)
|
||||
summary(lmmod)
|
||||
|
||||
ggplot(df, aes(x = employment, y = log_income)) +
|
||||
geom_point() +
|
||||
geom_smooth(method = "lm")
|
||||
theme_minimal()
|
||||
|
||||
lmmod = lm(log_income ~ employment, data = df)
|
||||
summary(lmmod)
|
||||
|
||||
lmmod = lm(householdincome ~ age + education + employment, data = df)
|
||||
summary(lmmod)
|
||||
|
||||
set.seed(Sys.Date())
|
||||
test_idx = sample(1:nrow(df), 5000, replace = FALSE)
|
||||
test_df = df[test_idx, ]
|
||||
train_df = df[-test_idx, ]
|
||||
|
||||
lmmod = lm(householdincome ~ age + education + employment, data = train_df)
|
||||
test_df$p_income = predict(lmmod, newdata = test_df)
|
||||
|
||||
ggplot(test_df, aes(x = p_income, y = householdincome)) +
|
||||
geom_abline(intercept = 0, slope = 1) +
|
||||
geom_point()+
|
||||
theme_minimal()
|
||||
|
||||
test_df = test_df[order(test_df$p_income), ]
|
||||
slice_n = 10
|
||||
slice_size = floor(nrow(test_df) / slice_n)
|
||||
for (i in 0 : (slice_n - 1)) {
|
||||
start_idx = slice_size * i + 1
|
||||
end_idx = ifelse(i == slice_n - 1, nrow(test_df), slice_size * (i + 1))
|
||||
slice = test_df[start_idx : end_idx,]
|
||||
|
||||
error = slice$p_income - slice$householdincome
|
||||
rmse = sqrt(mean(error ^ 2))
|
||||
mae = mean(abs(error))
|
||||
bias = mean(error)
|
||||
cat(sprintf("range: %.2f-%.2f, RMSE: %.2f, MAE: %.2f, Bias: %.2f\n",
|
||||
test_df[start_idx, ]$p_income,
|
||||
test_df[end_idx, ]$p_income,
|
||||
rmse, mae, bias)
|
||||
)
|
||||
}
|
||||
|
||||
64077
ds/25-1/r/zeta.csv
Normal file
64077
ds/25-1/r/zeta.csv
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user