ds: done
This commit is contained in:
@ -5,59 +5,87 @@ output:
|
||||
pdf_document:
|
||||
toc_float: TRUE
|
||||
---
|
||||
# Logit
|
||||
|
||||
```{r}
|
||||
# Simulate how odds and log-odds (logit) are distributed when the underlying
# probability is uniform on (0, 1).
n <- 500       # number of simulated probabilities
breaks <- 100  # number of histogram cells requested from hist()

# Vectorised draw: runif(n) consumes the RNG stream exactly like n successive
# runif(1) calls, so the simulated values match the former element-wise loop.
p <- runif(n)
odds <- p / (1 - p)
logit <- log(odds)

# Odds of a uniform probability are strongly right-skewed, while their
# logarithm is symmetric around 0.
hist(odds, breaks = breaks)
hist(logit, breaks = breaks)
|
||||
|
||||
# 4 x 2 table of counts, written one row per line, followed by Fisher's exact
# test on that table.
data <- rbind(
  c(2, 3),
  c(0, 3),
  c(0, 2),
  c(1, 2)
)

fisher.test(data)
|
||||
```
|
||||
# Data preparation
|
||||
```{r}
|
||||
# NOTE(review): hard-coded absolute path kept as in the original; it only
# resolves on the author's machine. Consider a project-relative path instead.
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')

# Load the survey data once into `survey`. The second read into the misspelled
# `surve` object was never referenced afterwards and has been dropped.
survey <- read.csv('survey.csv')

head(survey)

# 0/1 indicator columns for the price levels 20 and 30; rows with any other
# price have both indicators equal to 0.
survey$price20 <- ifelse(survey$Price == 20, 1, 0)
survey$price30 <- ifelse(survey$Price == 30, 1, 0)
head(survey)

# Constant column of ones.
survey$one <- 1
|
||||
```
|
||||
# Model training
|
||||
[Useful link 1](https://stats.stackexchange.com/questions/48178/how-to-interpret-the-intercept-term-in-a-glm)
|
||||
|
||||
Residuals are the differences between what we observe and what our model predicts.
|
||||
Residuals whose absolute value exceeds 3 lie in the tails of a standard normal distribution and usually indicate strain in the model.
|
||||
|
||||
<https://stats.stackexchange.com/questions/48178/how-to-interpret-the-intercept-term-in-a-glm>
|
||||
<https://library.virginia.edu/data/articles/understanding-deviance-residuals>
|
||||
```{r}
|
||||
# Logistic regression of MYDEPV on Income, Age and the two price indicators.
# `family` and `data` are passed by name so the call does not rely on glm()'s
# positional argument order.
model <- glm(
  MYDEPV ~ Income + Age + price20 + price30,
  family = binomial(link = "logit"),
  data = survey
)
summary(model)

# Quantiles of the model residuals (residuals() on a glm returns deviance
# residuals by default).
quantile(residuals(model))
# https://library.virginia.edu/data/articles/understanding-deviance-residuals
# Residuals are the differences between what we observe and what our model
# predicts.
# Residuals whose absolute value exceeds 3 lie in the tails of a standard
# normal distribution and usually indicate strain in the model.
|
||||
```
|
||||
# Getting coefficients
|
||||
```{r}
|
||||
# Percentage change in the odds for a one-unit increase in Income:
# (exp(beta) - 1) * 100.
beta_income <- coef(model)["Income"]
pct_income <- (exp(beta_income) - 1) * 100
pct_income

# price30 is a 0/1 indicator, so its coefficient is already the complete
# log-odds difference between price-30 rows and the reference rows (both
# price indicators 0). The indicator can only change by one unit, so the
# coefficient must not be multiplied by the 20-unit price gap.
beta_price30 <- coef(model)["price30"]
pct_price30 <- (exp(beta_price30) - 1) * 100
pct_price30
|
||||
```
|
||||
# Model predictions
|
||||
```{r}
|
||||
# In-sample predictions. predict() without `type` returns the linear predictor
# (log-odds), so exp() of it gives the odds; they are stored under the
# existing column name `odds_ratio`. odds / (1 + odds) converts them to
# probabilities.
survey$odds_ratio <- exp(predict(model))
survey$prediction <- survey$odds_ratio / (1 + survey$odds_ratio)
head(survey)

# Sum of the observed response next to the sum of the fitted probabilities.
sum(survey$MYDEPV)
sum(survey$prediction)

# Single hypothetical respondent, described with the same predictors the
# model was fitted on.
new_person <- data.frame(
  Income = 58,
  Age = 25,
  price20 = 1,
  price30 = 0
)

# type = "response" returns the prediction on the probability scale.
prob <- predict(model, newdata = new_person, type = "response")
prob
|
||||
```
|
||||
|
||||
```{r}
|
||||
library(ggplot2)

# Fitted probability paired with the observed response for every row.
predicted <- data.frame(
  prob = model$fitted.values,
  MYDEPV = survey$MYDEPV
)

# Sort by fitted probability and number the rows in that order.
predicted <- predicted[order(predicted$prob, decreasing = FALSE), ]
# seq_len() stays empty for a zero-row frame, where 1:nrow() would give c(1, 0).
predicted$rank <- seq_len(nrow(predicted))

# The y aesthetic is the fitted probability (not the raw MYDEPV response), so
# the axis label says so; MYDEPV is shown through the point colour.
ggplot(data = predicted, aes(x = rank, y = prob)) +
  geom_point(aes(color = MYDEPV), alpha = 0.5, shape = 4, stroke = 1) +
  xlab("Index") +
  ylab("Predicted probability of MYDEPV")
|
||||
```
|
||||
Reference in New Issue
Block a user