ds: r -> rmd
This commit is contained in:
@ -146,7 +146,7 @@ function genTables() {
|
||||
break
|
||||
|
||||
case '$':
|
||||
const args_end = gen.indexOf('$', i+2)
|
||||
const args_end = gen.indexOf('$', i+1)
|
||||
const args = gen.slice(i+1, args_end).split(',')
|
||||
switch (args[0]) {
|
||||
case 'C':
|
||||
|
||||
@ -663,7 +663,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
||||
@ -120,7 +120,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": null,
|
||||
"id": "61d898fb-a8d2-4d1c-a13f-2c4be6c18969",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -154,7 +154,7 @@
|
||||
" {{\n",
|
||||
" key: \"storage_type\"\n",
|
||||
" value: {{ string_value: \"AUTO\" }}\n",
|
||||
" }}\n",
|
||||
" }}т\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"dynamic_batching {{\n",
|
||||
|
||||
@ -1,3 +1,12 @@
|
||||
---
|
||||
title: "Lab2: Introduction to R, exploring the box-plot"
|
||||
author: "Vladislav Litvinov <vlad@sek1ro>"
|
||||
output:
|
||||
pdf_document:
|
||||
toc_float: TRUE
|
||||
---
|
||||
# Data preparation
|
||||
```{r}
|
||||
df = read.csv("./zipIncome.txt", sep = "|")
|
||||
colnames(df) <- c("zipCode", "income")
|
||||
|
||||
@ -10,7 +19,9 @@ if (any(is.na(df[,2]))) {
|
||||
|
||||
mean(df$income)
|
||||
median(df$income, na.rm=TRUE)
|
||||
|
||||
```
|
||||
# Histograms and box-plot "whiskers"
|
||||
```{r}
|
||||
plot(y=df$income, x=df$zipCode, xlab="income", ylab="zipCode")
|
||||
df$incomelog = log10(df$income)
|
||||
hist(df$incomelog, breaks=80)
|
||||
@ -42,8 +53,9 @@ ggplot(df, aes(x=zipCode, y=income, color=zipCode)) +
|
||||
breaks = c(1e4, 25e3, 5e4, 1e5, 2e5, 5e5)
|
||||
) +
|
||||
labs(
|
||||
title = "Распределение доходов по почтовым индексам",
|
||||
title = "Income distribution by ZIP codes",
|
||||
subtitle = "Scatter plot jitter",
|
||||
) +
|
||||
theme_minimal()
|
||||
|
||||
```
|
||||
@ -1,9 +1,20 @@
|
||||
---
|
||||
title: "Lab4: K-means, DB-scan and dendrograms"
|
||||
author: "Vladislav Litvinov <vlad@sek1ro>"
|
||||
output:
|
||||
pdf_document:
|
||||
toc_float: TRUE
|
||||
---
|
||||
# Data preparationc
|
||||
```{r}
|
||||
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')
|
||||
load("./income_elec_state.Rdata")
|
||||
df = income_elec_state
|
||||
df$incomelog = log10(df$income)
|
||||
remove(income_elec_state)
|
||||
|
||||
```
|
||||
# Function to compute Within-Cluster Sum of Squares for choosing optimal K
|
||||
```{r}
|
||||
elbow_wss = function(df) {
|
||||
max_k = 10
|
||||
wss = numeric(max_k)
|
||||
@ -19,7 +30,9 @@ elbow_wss = function(df) {
|
||||
wss_ratio = wss_diff[-1] / wss_diff[-length(wss_diff)]
|
||||
return(which.min(wss_ratio))
|
||||
}
|
||||
|
||||
```
|
||||
# Scatter-plot: elec vs income and log-income
|
||||
```{r}
|
||||
library(ggplot2)
|
||||
|
||||
plot_kmeans = function(data, k, log) {
|
||||
@ -74,7 +87,9 @@ klog = elbow_wss(datalog)
|
||||
plot_kmeans(data, k, log=FALSE)
|
||||
plot_kmeans(data, klog, log=TRUE)
|
||||
|
||||
|
||||
```
|
||||
# Map of USA
|
||||
```{r}
|
||||
library(maps)
|
||||
res = kmeans(data, centers = k)
|
||||
map_color = res$cluster[order(names(res$cluster))]
|
||||
@ -94,6 +109,10 @@ klog = elbow_wss(datalog)
|
||||
plot_kmeans(data, k, log=FALSE)
|
||||
plot_kmeans(data, klog, log=TRUE)
|
||||
|
||||
```
|
||||
# Differences within dendrogramm algo: 'single', 'complete', 'ward.D', 'average'
|
||||
```{r}
|
||||
|
||||
library(ggdendro)
|
||||
|
||||
|
||||
@ -136,3 +155,4 @@ clust = hclust(distance, method = "single")
|
||||
plot(ggdendrogram(clust))
|
||||
|
||||
cutree(clust, k = 3)
|
||||
```
|
||||
@ -1,3 +1,12 @@
|
||||
---
|
||||
title: "Lab5: Associative rules, Apriori"
|
||||
author: "Vladislav Litvinov <vlad@sek1ro>"
|
||||
output:
|
||||
pdf_document:
|
||||
toc_float: TRUE
|
||||
---
|
||||
# Histogram of transaction frequencies
|
||||
```{r}
|
||||
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')
|
||||
library(arules)
|
||||
library(arulesViz)
|
||||
@ -10,7 +19,9 @@ ift = sort(itemFrequency(ts), decreasing = TRUE)
|
||||
|
||||
(most_frequent_item = ift[1])
|
||||
(max_ts_size = max(size(ts)))
|
||||
|
||||
```
|
||||
# Model training. Rules computing
|
||||
```{r}
|
||||
rules = apriori(ts, parameter = list(support = 0.01, confidence = 0))
|
||||
length(rules)
|
||||
plot(rules, jitter = 0)
|
||||
@ -18,7 +29,9 @@ plot(rules, jitter = 0)
|
||||
rules50 = apriori(ts, parameter = list(support = 0.01, confidence = 0.5))
|
||||
length(rules50)
|
||||
plot(rules50, jitter = 0)
|
||||
|
||||
```
|
||||
# Manual threshold applying: confidence = 0.5, plots comparsion
|
||||
```{r}
|
||||
library(ggplot2)
|
||||
asc = function(q, colors = c("lightgray", "red")) {
|
||||
q = q[order(q$lift), ]
|
||||
@ -39,9 +52,11 @@ asc(quality50, colors = c("navy", "cyan"))
|
||||
|
||||
quality = as.data.frame(quality(rules))
|
||||
asc(subset(quality, quality$confidence > 0.5))
|
||||
|
||||
plot(rules, measure = c("support", "lift"), engine = "interactive", shading = "confidence")
|
||||
plot(rules, engine = "interactive")
|
||||
```
|
||||
# Top 3 rules by lift. 'Relation' graph and matrix
|
||||
```{r}
|
||||
plot(rules, measure = c("support", "lift"), shading = "confidence")
|
||||
plot(rules)
|
||||
|
||||
filt_rules = rules[which(quality(rules)$confidence > 0.8)]
|
||||
quality = as.data.frame(quality(filt_rules))
|
||||
@ -55,8 +70,9 @@ plot(filt_rules,
|
||||
|
||||
top3_rules = head(sort(filt_rules, by = "lift", decreasing = TRUE), 3)
|
||||
plot(top3_rules, method = "graph")
|
||||
|
||||
|
||||
```
|
||||
# Random picking of train and test datasets
|
||||
```{r}
|
||||
train_set = ts[1:8000]
|
||||
test_set = ts[8001:10000]
|
||||
|
||||
@ -84,3 +100,4 @@ plot(comparison$train_support, comparison$test_support,
|
||||
ylab = "test support",
|
||||
pch = 19)
|
||||
abline(0, 1, lty = 2)
|
||||
```
|
||||
@ -1,3 +1,12 @@
|
||||
---
|
||||
title: "Lab6: Linear regression"
|
||||
author: "Vladislav Litvinov <vlad@sek1ro>"
|
||||
output:
|
||||
pdf_document:
|
||||
toc_float: TRUE
|
||||
---
|
||||
# Data preparation
|
||||
```{r}
|
||||
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')
|
||||
df = read.csv('zeta.csv')
|
||||
head(df)
|
||||
@ -9,26 +18,31 @@ df = subset(df, 8 < meaneducation & meaneducation < 18 &
|
||||
|
||||
log_income = log10(df$meanhouseholdincome)
|
||||
colnames(df) = c("age", "education", "employment", "householdincome")
|
||||
|
||||
```
|
||||
# Linear regression graph
|
||||
```{r}
|
||||
library(ggplot2)
|
||||
ggplot(df, aes(x = age, y = log_income)) +
|
||||
geom_point() +
|
||||
geom_smooth(method = "lm")
|
||||
geom_smooth(method = "lm") +
|
||||
theme_minimal()
|
||||
|
||||
lmmod = lm(log_income ~ age, data = df)
|
||||
summary(lmmod)
|
||||
```
|
||||
|
||||
# xm = sum(xi) / n
|
||||
# sd = sqrt(sum(xi - xm) / (n - 1))
|
||||
# mr = sd / sqrt(n): вариация
|
||||
# t = (xm1 - xm2) / sqrt(mr1 ^ 2 + mr2 ^ 2)
|
||||
# f = (n1 + n2) - 2: степень свободы
|
||||
# p = 0,03 probability of obtaining test results at least as extreme as the result actually observed
|
||||
# Formulas for Student's t-test and p-value
|
||||
> xm = sum(xi) / n
|
||||
sd = sqrt(sum(xi - xm) / (n - 1))
|
||||
mr = sd / sqrt(n)
|
||||
t = (xm1 - xm2) / sqrt(mr1 ^ 2 + mr2 ^ 2)
|
||||
f = (n1 + n2) - 2
|
||||
p = 0,03 probability of obtaining test results at least as extreme as the result actually observed
|
||||
|
||||
```{r}
|
||||
ggplot(df, aes(x = education, y = log_income)) +
|
||||
geom_point() +
|
||||
geom_smooth(method = "lm")
|
||||
geom_smooth(method = "lm") +
|
||||
theme_minimal()
|
||||
|
||||
lmmod = lm(log_income ~ education, data = df)
|
||||
@ -36,15 +50,17 @@ summary(lmmod)
|
||||
|
||||
ggplot(df, aes(x = employment, y = log_income)) +
|
||||
geom_point() +
|
||||
geom_smooth(method = "lm")
|
||||
theme_minimal()
|
||||
geom_smooth(method = "lm") +
|
||||
theme_minimal()
|
||||
|
||||
lmmod = lm(log_income ~ employment, data = df)
|
||||
summary(lmmod)
|
||||
|
||||
lmmod = lm(householdincome ~ age + education + employment, data = df)
|
||||
summary(lmmod)
|
||||
|
||||
```
|
||||
# Random test and train datasets slices
|
||||
```{r}
|
||||
set.seed(Sys.Date())
|
||||
test_idx = sample(1:nrow(df), 5000, replace = FALSE)
|
||||
test_df = df[test_idx, ]
|
||||
@ -57,7 +73,9 @@ ggplot(test_df, aes(x = p_income, y = householdincome)) +
|
||||
geom_abline(intercept = 0, slope = 1) +
|
||||
geom_point()+
|
||||
theme_minimal()
|
||||
|
||||
```
|
||||
# Error measurements: MSE, RMSE, MAE, Bias
|
||||
```{r}
|
||||
test_df = test_df[order(test_df$p_income), ]
|
||||
slice_n = 10
|
||||
slice_size = floor(nrow(test_df) / slice_n)
|
||||
@ -76,4 +94,4 @@ for (i in 0 : (slice_n - 1)) {
|
||||
rmse, mae, bias)
|
||||
)
|
||||
}
|
||||
|
||||
```
|
||||
@ -1,3 +1,12 @@
|
||||
---
|
||||
title: "Lab7: Logistic regression"
|
||||
author: "Vladislav Litvinov <vlad@sek1ro>"
|
||||
output:
|
||||
pdf_document:
|
||||
toc_float: TRUE
|
||||
---
|
||||
# Data preparation
|
||||
```{r}
|
||||
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')
|
||||
survey <- read.csv('survey.csv')
|
||||
|
||||
@ -8,9 +17,10 @@ survey$price30 <- ifelse(survey$Price == 30, 1, 0)
|
||||
head(survey)
|
||||
|
||||
survey$one <- 1
|
||||
|
||||
#https://stats.stackexchange.com/questions/48178/how-to-interpret-the-intercept-term-in-a-glm
|
||||
|
||||
```
|
||||
# Model training
|
||||
[Useful link 1](https://stats.stackexchange.com/questions/48178/how-to-interpret-the-intercept-term-in-a-glm)
|
||||
```{r}
|
||||
model <- glm(
|
||||
MYDEPV ~ Income + Age + price20 + price30,
|
||||
binomial(link = "logit"),
|
||||
@ -21,7 +31,9 @@ quantile(residuals(model))
|
||||
#https://library.virginia.edu/data/articles/understanding-deviance-residuals
|
||||
#Residuals are the differences between what we observe and what our model predicts.
|
||||
#Residuals greater than the absolute value of 3 are in the tails of a standard normal distribution and usually indicate strain in the model.
|
||||
|
||||
```
|
||||
# Getting coefficients
|
||||
```{r}
|
||||
beta_income <- coef(model)["Income"]
|
||||
pct_income <- (exp(beta_income) - 1) * 100
|
||||
pct_income
|
||||
@ -29,7 +41,9 @@ pct_income
|
||||
beta_price30 <- coef(model)["price30"]
|
||||
pct_price30 <- (exp(beta_price30 * 20) - 1) * 100
|
||||
pct_price30
|
||||
|
||||
```
|
||||
# Predicts for the model
|
||||
```{r}
|
||||
survey$odds_ratio <- exp(predict(model))
|
||||
survey$prediction <- survey$odds_ratio / (1 + survey$odds_ratio)
|
||||
head(survey)
|
||||
@ -46,3 +60,4 @@ new_person <- data.frame(
|
||||
|
||||
prob <- predict(model, new_person, type="response")
|
||||
prob
|
||||
```
|
||||
@ -1,18 +1,30 @@
|
||||
---
|
||||
title: "Lab8: Naive bayes classifier"
|
||||
author: "Vladislav Litvinov <vlad@sek1ro>"
|
||||
output:
|
||||
pdf_document:
|
||||
toc_float: TRUE
|
||||
---
|
||||
# Data splitting (test, train datasets)
|
||||
```{r}
|
||||
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')
|
||||
df = read.csv("nbtrain.csv", stringsAsFactors = TRUE)
|
||||
trdf = df[1:9010,]
|
||||
tedf = df[9011:10010,]
|
||||
remove(df)
|
||||
library(e1071)
|
||||
|
||||
```
|
||||
# Model training
|
||||
```{r}
|
||||
nb = naiveBayes(income ~ age + sex + educ, data = trdf, laplace = 1)
|
||||
# p(A|B)*p(B) = p(AB) = p(B|A)*p(A)
|
||||
# p(A|B) = p(B|A) * p(A) / p(B)
|
||||
# апостер = услов * априор / маргин
|
||||
nb$apriori / sum (nb$apriori)
|
||||
nb$tables
|
||||
|
||||
|
||||
```
|
||||
# Model testing. Confidence total and confidence by class
|
||||
```{r}
|
||||
pd = predict(nb, tedf)
|
||||
(conf_mat = table(Actual = tedf$income, Predicted = pd))
|
||||
|
||||
@ -38,9 +50,9 @@ pd = predict(nb, tedf)
|
||||
(conf_mat = table(Actual = tedf$sex, Predicted = pd))
|
||||
conf_tot(conf_mat)
|
||||
conf_class(conf_mat)
|
||||
|
||||
|
||||
|
||||
```
|
||||
# Separated male and female
|
||||
```{r}
|
||||
male = trdf[trdf$sex == "M", ]
|
||||
female = trdf[trdf$sex == "F", ]
|
||||
|
||||
@ -63,3 +75,4 @@ nbrandom = function() {
|
||||
set.seed(Sys.time())
|
||||
nbrandom()
|
||||
|
||||
```
|
||||
Reference in New Issue
Block a user