ds: r -> rmd

This commit is contained in:
2025-12-27 22:36:25 +03:00
parent f2c718fc3f
commit 1798c69564
9 changed files with 136 additions and 41 deletions

View File

@ -146,7 +146,7 @@ function genTables() {
break
case '$':
const args_end = gen.indexOf('$', i+2)
const args_end = gen.indexOf('$', i+1)
const args = gen.slice(i+1, args_end).split(',')
switch (args[0]) {
case 'C':

View File

@ -663,7 +663,7 @@
},
{
"cell_type": "code",
"execution_count": 48,
"execution_count": null,
"metadata": {},
"outputs": [
{

View File

@ -120,7 +120,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"id": "61d898fb-a8d2-4d1c-a13f-2c4be6c18969",
"metadata": {},
"outputs": [],
@ -154,7 +154,7 @@
" {{\n",
" key: \"storage_type\"\n",
" value: {{ string_value: \"AUTO\" }}\n",
" }}\n",
"      }}\n",
"]\n",
"\n",
"dynamic_batching {{\n",

View File

@ -1,3 +1,12 @@
---
title: "Lab2: Introduction to R, exploring the box-plot"
author: "Vladislav Litvinov <vlad@sek1ro>"
output:
pdf_document:
toc_float: TRUE
---
# Data preparation
```{r}
df = read.csv("./zipIncome.txt", sep = "|")
colnames(df) <- c("zipCode", "income")
@ -10,7 +19,9 @@ if (any(is.na(df[,2]))) {
mean(df$income)
median(df$income, na.rm=TRUE)
```
# Histograms and box-plot "whiskers"
```{r}
plot(y=df$income, x=df$zipCode, xlab="income", ylab="zipCode")
df$incomelog = log10(df$income)
hist(df$incomelog, breaks=80)
@ -42,8 +53,9 @@ ggplot(df, aes(x=zipCode, y=income, color=zipCode)) +
breaks = c(1e4, 25e3, 5e4, 1e5, 2e5, 5e5)
) +
labs(
title = "Распределение доходов по почтовым индексам",
title = "Income distribution by ZIP codes",
subtitle = "Scatter plot jitter",
) +
theme_minimal()
```

View File

@ -1,9 +1,20 @@
---
title: "Lab4: K-means, DB-scan and dendrograms"
author: "Vladislav Litvinov <vlad@sek1ro>"
output:
pdf_document:
toc_float: TRUE
---
# Data preparation
```{r}
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')
load("./income_elec_state.Rdata")
df = income_elec_state
df$incomelog = log10(df$income)
remove(income_elec_state)
```
# Function to compute Within-Cluster Sum of Squares for choosing optimal K
```{r}
elbow_wss = function(df) {
max_k = 10
wss = numeric(max_k)
@ -19,7 +30,9 @@ elbow_wss = function(df) {
wss_ratio = wss_diff[-1] / wss_diff[-length(wss_diff)]
return(which.min(wss_ratio))
}
```
# Scatter-plot: elec vs income and log-income
```{r}
library(ggplot2)
plot_kmeans = function(data, k, log) {
@ -74,7 +87,9 @@ klog = elbow_wss(datalog)
plot_kmeans(data, k, log=FALSE)
plot_kmeans(data, klog, log=TRUE)
```
# Map of USA
```{r}
library(maps)
res = kmeans(data, centers = k)
map_color = res$cluster[order(names(res$cluster))]
@ -94,6 +109,10 @@ klog = elbow_wss(datalog)
plot_kmeans(data, k, log=FALSE)
plot_kmeans(data, klog, log=TRUE)
```
# Differences between dendrogram algorithms: 'single', 'complete', 'ward.D', 'average'
```{r}
library(ggdendro)
@ -136,3 +155,4 @@ clust = hclust(distance, method = "single")
plot(ggdendrogram(clust))
cutree(clust, k = 3)
```

View File

@ -1,3 +1,12 @@
---
title: "Lab5: Associative rules, Apriori"
author: "Vladislav Litvinov <vlad@sek1ro>"
output:
pdf_document:
toc_float: TRUE
---
# Histogram of transaction frequencies
```{r}
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')
library(arules)
library(arulesViz)
@ -10,7 +19,9 @@ ift = sort(itemFrequency(ts), decreasing = TRUE)
(most_frequent_item = ift[1])
(max_ts_size = max(size(ts)))
```
# Model training. Rules computing
```{r}
rules = apriori(ts, parameter = list(support = 0.01, confidence = 0))
length(rules)
plot(rules, jitter = 0)
@ -18,7 +29,9 @@ plot(rules, jitter = 0)
rules50 = apriori(ts, parameter = list(support = 0.01, confidence = 0.5))
length(rules50)
plot(rules50, jitter = 0)
```
# Applying a manual threshold: confidence = 0.5, plot comparison
```{r}
library(ggplot2)
asc = function(q, colors = c("lightgray", "red")) {
q = q[order(q$lift), ]
@ -39,9 +52,11 @@ asc(quality50, colors = c("navy", "cyan"))
quality = as.data.frame(quality(rules))
asc(subset(quality, quality$confidence > 0.5))
plot(rules, measure = c("support", "lift"), engine = "interactive", shading = "confidence")
plot(rules, engine = "interactive")
```
# Top 3 rules by lift. 'Relation' graph and matrix
```{r}
plot(rules, measure = c("support", "lift"), shading = "confidence")
plot(rules)
filt_rules = rules[which(quality(rules)$confidence > 0.8)]
quality = as.data.frame(quality(filt_rules))
@ -55,8 +70,9 @@ plot(filt_rules,
top3_rules = head(sort(filt_rules, by = "lift", decreasing = TRUE), 3)
plot(top3_rules, method = "graph")
```
# Random picking of train and test datasets
```{r}
train_set = ts[1:8000]
test_set = ts[8001:10000]
@ -84,3 +100,4 @@ plot(comparison$train_support, comparison$test_support,
ylab = "test support",
pch = 19)
abline(0, 1, lty = 2)
```

View File

@ -1,3 +1,12 @@
---
title: "Lab6: Linear regression"
author: "Vladislav Litvinov <vlad@sek1ro>"
output:
pdf_document:
toc_float: TRUE
---
# Data preparation
```{r}
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')
df = read.csv('zeta.csv')
head(df)
@ -9,26 +18,31 @@ df = subset(df, 8 < meaneducation & meaneducation < 18 &
log_income = log10(df$meanhouseholdincome)
colnames(df) = c("age", "education", "employment", "householdincome")
```
# Linear regression graph
```{r}
library(ggplot2)
ggplot(df, aes(x = age, y = log_income)) +
geom_point() +
geom_smooth(method = "lm")
geom_smooth(method = "lm") +
theme_minimal()
lmmod = lm(log_income ~ age, data = df)
summary(lmmod)
```
# xm = sum(xi) / n
# sd = sqrt(sum(xi - xm) / (n - 1))
# mr = sd / sqrt(n): вариация
# t = (xm1 - xm2) / sqrt(mr1 ^ 2 + mr2 ^ 2)
# f = (n1 + n2) - 2: степень свободы
# p = 0,03 probability of obtaining test results at least as extreme as the result actually observed
# Formulas for Student's t-test and p-value
xm = sum(xi) / n
sd = sqrt(sum(xi - xm) / (n - 1))
mr = sd / sqrt(n)
t = (xm1 - xm2) / sqrt(mr1 ^ 2 + mr2 ^ 2)
f = (n1 + n2) - 2
p = 0.03: the probability of obtaining test results at least as extreme as the result actually observed
```{r}
ggplot(df, aes(x = education, y = log_income)) +
geom_point() +
geom_smooth(method = "lm")
geom_smooth(method = "lm") +
theme_minimal()
lmmod = lm(log_income ~ education, data = df)
@ -36,15 +50,17 @@ summary(lmmod)
ggplot(df, aes(x = employment, y = log_income)) +
geom_point() +
geom_smooth(method = "lm")
theme_minimal()
geom_smooth(method = "lm") +
theme_minimal()
lmmod = lm(log_income ~ employment, data = df)
summary(lmmod)
lmmod = lm(householdincome ~ age + education + employment, data = df)
summary(lmmod)
```
# Random train and test dataset slices
```{r}
set.seed(Sys.Date())
test_idx = sample(1:nrow(df), 5000, replace = FALSE)
test_df = df[test_idx, ]
@ -57,7 +73,9 @@ ggplot(test_df, aes(x = p_income, y = householdincome)) +
geom_abline(intercept = 0, slope = 1) +
geom_point()+
theme_minimal()
```
# Error measurements: MSE, RMSE, MAE, Bias
```{r}
test_df = test_df[order(test_df$p_income), ]
slice_n = 10
slice_size = floor(nrow(test_df) / slice_n)
@ -76,4 +94,4 @@ for (i in 0 : (slice_n - 1)) {
rmse, mae, bias)
)
}
```

View File

@ -1,3 +1,12 @@
---
title: "Lab7: Logistic regression"
author: "Vladislav Litvinov <vlad@sek1ro>"
output:
pdf_document:
toc_float: TRUE
---
# Data preparation
```{r}
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')
survey <- read.csv('survey.csv')
@ -8,9 +17,10 @@ survey$price30 <- ifelse(survey$Price == 30, 1, 0)
head(survey)
survey$one <- 1
#https://stats.stackexchange.com/questions/48178/how-to-interpret-the-intercept-term-in-a-glm
```
# Model training
[Useful link 1](https://stats.stackexchange.com/questions/48178/how-to-interpret-the-intercept-term-in-a-glm)
```{r}
model <- glm(
MYDEPV ~ Income + Age + price20 + price30,
binomial(link = "logit"),
@ -21,7 +31,9 @@ quantile(residuals(model))
#https://library.virginia.edu/data/articles/understanding-deviance-residuals
#Residuals are the differences between what we observe and what our model predicts.
#Residuals greater than the absolute value of 3 are in the tails of a standard normal distribution and usually indicate strain in the model.
```
# Getting coefficients
```{r}
beta_income <- coef(model)["Income"]
pct_income <- (exp(beta_income) - 1) * 100
pct_income
@ -29,7 +41,9 @@ pct_income
beta_price30 <- coef(model)["price30"]
pct_price30 <- (exp(beta_price30 * 20) - 1) * 100
pct_price30
```
# Predicts for the model
```{r}
survey$odds_ratio <- exp(predict(model))
survey$prediction <- survey$odds_ratio / (1 + survey$odds_ratio)
head(survey)
@ -46,3 +60,4 @@ new_person <- data.frame(
prob <- predict(model, new_person, type="response")
prob
```

View File

@ -1,18 +1,30 @@
---
title: "Lab8: Naive bayes classifier"
author: "Vladislav Litvinov <vlad@sek1ro>"
output:
pdf_document:
toc_float: TRUE
---
# Data splitting (test, train datasets)
```{r}
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')
df = read.csv("nbtrain.csv", stringsAsFactors = TRUE)
trdf = df[1:9010,]
tedf = df[9011:10010,]
remove(df)
library(e1071)
```
# Model training
```{r}
nb = naiveBayes(income ~ age + sex + educ, data = trdf, laplace = 1)
# p(A|B)*p(B) = p(AB) = p(B|A)*p(A)
# p(A|B) = p(B|A) * p(A) / p(B)
# posterior = likelihood * prior / marginal
nb$apriori / sum (nb$apriori)
nb$tables
```
# Model testing. Confidence total and confidence by class
```{r}
pd = predict(nb, tedf)
(conf_mat = table(Actual = tedf$income, Predicted = pd))
@ -38,9 +50,9 @@ pd = predict(nb, tedf)
(conf_mat = table(Actual = tedf$sex, Predicted = pd))
conf_tot(conf_mat)
conf_class(conf_mat)
```
# Separated male and female
```{r}
male = trdf[trdf$sex == "M", ]
female = trdf[trdf$sex == "F", ]
@ -63,3 +75,4 @@ nbrandom = function() {
set.seed(Sys.time())
nbrandom()
```