setwd('/home/sek1ro/git/public/lab/ds/25-1/r') df = read.csv('zeta.csv') head(df) df = df[df$sex == 'F', -which(names(df) %in% c("X", "zcta", "sex"))] df = subset(df, 8 < meaneducation & meaneducation < 18 & 10000 < meanhouseholdincome & meanhouseholdincome < 200000 & 0 < meanemployment & meanemployment < 3 & 20 < meanage & meanage < 60) log_income = log10(df$meanhouseholdincome) colnames(df) = c("age", "education", "employment", "householdincome") library(ggplot2) ggplot(df, aes(x = age, y = log_income)) + geom_point() + geom_smooth(method = "lm") theme_minimal() lmmod = lm(log_income ~ age, data = df) summary(lmmod) # xm = sum(xi) / n # sd = sqrt(sum(xi - xm) / (n - 1)) # mr = sd / sqrt(n): вариация # t = (xm1 - xm2) / sqrt(mr1 ^ 2 + mr2 ^ 2) # f = (n1 + n2) - 2: степень свободы # p: по таблице: Отличаются с вероятностью ошибки ... ggplot(df, aes(x = education, y = log_income)) + geom_point() + geom_smooth(method = "lm") theme_minimal() lmmod = lm(log_income ~ education, data = df) summary(lmmod) ggplot(df, aes(x = employment, y = log_income)) + geom_point() + geom_smooth(method = "lm") theme_minimal() lmmod = lm(log_income ~ employment, data = df) summary(lmmod) lmmod = lm(householdincome ~ age + education + employment, data = df) summary(lmmod) set.seed(Sys.Date()) test_idx = sample(1:nrow(df), 5000, replace = FALSE) test_df = df[test_idx, ] train_df = df[-test_idx, ] lmmod = lm(householdincome ~ age + education + employment, data = train_df) test_df$p_income = predict(lmmod, newdata = test_df) ggplot(test_df, aes(x = p_income, y = householdincome)) + geom_abline(intercept = 0, slope = 1) + geom_point()+ theme_minimal() test_df = test_df[order(test_df$p_income), ] slice_n = 10 slice_size = floor(nrow(test_df) / slice_n) for (i in 0 : (slice_n - 1)) { start_idx = slice_size * i + 1 end_idx = ifelse(i == slice_n - 1, nrow(test_df), slice_size * (i + 1)) slice = test_df[start_idx : end_idx,] error = slice$p_income - slice$householdincome rmse = sqrt(mean(error ^ 2)) mae = mean(abs(error)) bias = mean(error) cat(sprintf("range: %.2f-%.2f, RMSE: %.2f, MAE: %.2f, Bias: %.2f\n", test_df[start_idx, ]$p_income, test_df[end_idx, ]$p_income, rmse, mae, bias) ) }