This commit is contained in:
2026-02-13 14:03:28 +03:00
parent 417326498e
commit 65218abfb1
159 changed files with 2577567 additions and 2553 deletions

View File

@ -5,59 +5,87 @@ output:
pdf_document:
toc_float: TRUE
---
# Logit
```{r}
# Simulate n probabilities and look at the distribution of the corresponding
# odds and log-odds (logit) values.
# Vectorized: runif(n) draws the same RNG stream as n successive runif(1)
# calls, so this reproduces the original loop's values exactly.
n <- 500
breaks <- 100
p <- runif(n)
odds <- p / (1 - p)
logit <- log(odds)
hist(odds, breaks = breaks)
hist(logit, breaks = breaks)

# 4x2 contingency table for Fisher's exact test.
# (Renamed from `data`, which shadows the base function utils::data.)
counts <- matrix(
  nrow = 4, ncol = 2, byrow = TRUE,
  data = c(
    2, 3,
    0, 3,
    0, 2,
    1, 2
  )
)
fisher.test(counts)
```
# Data preparation
```{r}
# Load the survey data and dummy-code the Price variable for the logit model.
# NOTE(review): setwd() in a script is fragile — prefer an absolute path in
# read.csv() or a project-root helper; kept here to preserve behavior.
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')
survey <- read.csv('survey.csv')
head(survey)

# Dummy variables for the Price levels used as model terms
# (the omitted level acts as the baseline — presumably Price == 10;
# TODO confirm against the data).
survey$price20 <- ifelse(survey$Price == 20, 1, 0)
survey$price30 <- ifelse(survey$Price == 30, 1, 0)
head(survey)
survey$one <- 1
```
# Model training
[Useful link 1](https://stats.stackexchange.com/questions/48178/how-to-interpret-the-intercept-term-in-a-glm)
Residuals are the differences between what we observe and what our model predicts.
Residuals with an absolute value greater than 3 fall in the tails of a standard normal distribution and usually indicate strain in the model.
- [How to interpret the intercept term in a GLM](https://stats.stackexchange.com/questions/48178/how-to-interpret-the-intercept-term-in-a-glm)
- [Understanding deviance residuals](https://library.virginia.edu/data/articles/understanding-deviance-residuals)
```{r}
# Fit a binomial logit model: purchase decision (MYDEPV) as a function of
# income, age, and the price dummies.
model <- glm(
  MYDEPV ~ Income + Age + price20 + price30,
  family = binomial(link = "logit"),
  data = survey
)
summary(model)

# Deviance residuals are the differences between what we observe and what the
# model predicts. Values with absolute value above ~3 lie in the tails of a
# standard normal distribution and usually indicate strain in the model.
# See https://library.virginia.edu/data/articles/understanding-deviance-residuals
quantile(residuals(model))
```
# Getting coefficients
```{r}
# Percent change in the odds of MYDEPV per one-unit increase in Income:
# 100 * (exp(beta) - 1).
beta_income <- coef(model)["Income"]
pct_income <- 100 * (exp(beta_income) - 1)
pct_income

# NOTE(review): price30 is a 0/1 dummy, so exponentiating 20 * beta is an
# unusual transformation — confirm the intended interpretation.
beta_price30 <- coef(model)["price30"]
pct_price30 <- 100 * (exp(20 * beta_price30) - 1)
pct_price30
```
# Predicts for the model
```{r}
# Convert the model's linear predictor to probabilities via the odds ratio.
# (Equivalent to predict(model, type = "response").)
survey$odds_ratio <- exp(predict(model))
survey$prediction <- survey$odds_ratio / (1 + survey$odds_ratio)
head(survey)

# Sanity check: predicted probabilities should sum to roughly the observed
# number of positives.
sum(survey$MYDEPV)
sum(survey$prediction)

# Score one new respondent offered Price == 20.
new_person <- data.frame(
  Income = 58,
  Age = 25,
  price20 = 1,
  price30 = 0
)
prob <- predict(model, new_person, type = "response")
prob
```
```{r}
library(ggplot2)

# Plot fitted probabilities in ascending order, colored by the observed class,
# to visualize how well the model separates the two outcomes.
predicted <- data.frame(
  prob = model$fitted.values,
  MYDEPV = survey$MYDEPV
)
predicted <- predicted[order(predicted$prob), ]
# seq_len() is safe for zero-row input, unlike 1:nrow().
predicted$rank <- seq_len(nrow(predicted))
ggplot(data = predicted, aes(x = rank, y = prob)) +
  geom_point(aes(color = MYDEPV), alpha = 0.5, shape = 4, stroke = 1) +
  xlab("Index") +
  ylab("MYDEPV")
```