---
title: "Lab7: Logistic regression"
author: "Vladislav Litvinov"
output:
  pdf_document:
    toc: TRUE
---

# Logit

```{r}
# Simulate n random probabilities and inspect the distributions of the
# corresponding odds and log-odds (logit).
# Vectorized: runif(n) draws the same RNG stream as n calls to runif(1),
# so this is equivalent to the element-wise loop but idiomatic R.
n <- 500
breaks <- 100

p <- runif(n)
odds <- p / (1 - p)
logit <- log(odds)

hist(odds, breaks = breaks)
hist(logit, breaks = breaks)

# 4x2 contingency table; Fisher's exact test of row/column independence.
data <- matrix(
  nrow = 4, ncol = 2, byrow = TRUE,
  data = c(2, 3, 0, 3, 0, 2, 1, 2)
)
fisher.test(data)
```

# Data preparation

```{r}
# FIX: the original assigned the data to `surve` while every later
# reference uses `survey`, so the document failed at the first
# head(survey) call. Read with an explicit path instead of setwd().
survey <- read.csv("/home/sek1ro/git/public/lab/ds/25-1/r/survey.csv")
head(survey)

# Dummy-code the Price column; the remaining price level is the
# reference category absorbed by the intercept.
survey$price20 <- ifelse(survey$Price == 20, 1, 0)
survey$price30 <- ifelse(survey$Price == 30, 1, 0)
head(survey)
```

# Model training

Residuals are the differences between what we observe and what our model
predicts. Residuals greater than 3 in absolute value lie in the tails of a
standard normal distribution and usually indicate strain in the model.

https://stats.stackexchange.com/questions/48178/how-to-interpret-the-intercept-term-in-a-glm
https://library.virginia.edu/data/articles/understanding-deviance-residuals

```{r}
# Logistic regression of the purchase decision on income, age and the
# price dummies. family/data are named to avoid positional-argument
# confusion in glm().
model <- glm(
  MYDEPV ~ Income + Age + price20 + price30,
  family = binomial(link = "logit"),
  data = survey
)
summary(model)
quantile(residuals(model))
```

# Predictions for the model

```{r}
# predict() without type= returns the linear predictor (log-odds);
# exponentiate to get odds, then convert odds to probabilities.
survey$odds_ratio <- exp(predict(model))
survey$prediction <- survey$odds_ratio / (1 + survey$odds_ratio)
head(survey)

# Sanity check: fitted probabilities should sum to approximately the
# observed count of positive outcomes.
sum(survey$MYDEPV)
sum(survey$prediction)

# Probability of purchase for a single new observation at price 20.
new_person <- data.frame(
  Income = 58,
  Age = 25,
  price20 = 1,
  price30 = 0
)
prob <- predict(model, new_person, type = "response")
prob
```

```{r}
library(ggplot2)

# Rank observations by fitted probability and plot the ranked curve,
# colouring points by the observed outcome: good separation of the two
# colours along the S-curve indicates a useful classifier.
predicted <- data.frame(
  prob = model$fitted.values,
  MYDEPV = survey$MYDEPV
)
predicted <- predicted[order(predicted$prob, decreasing = FALSE), ]
predicted$rank <- seq_len(nrow(predicted))

ggplot(data = predicted, aes(x = rank, y = prob)) +
  geom_point(aes(color = MYDEPV), alpha = 0.5, shape = 4, stroke = 1) +
  xlab("Index") +
  ylab("MYDEPV")
```