ds: r -> rmd

This commit is contained in:
2025-12-27 22:36:25 +03:00
parent f2c718fc3f
commit 1798c69564
9 changed files with 136 additions and 41 deletions

View File

@ -146,7 +146,7 @@ function genTables() {
break
case '$':
const args_end = gen.indexOf('$', i+2)
const args_end = gen.indexOf('$', i+1)
const args = gen.slice(i+1, args_end).split(',')
switch (args[0]) {
case 'C':

View File

@ -663,7 +663,7 @@
},
{
"cell_type": "code",
"execution_count": 48,
"execution_count": null,
"metadata": {},
"outputs": [
{

View File

@ -120,7 +120,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"id": "61d898fb-a8d2-4d1c-a13f-2c4be6c18969",
"metadata": {},
"outputs": [],
@ -154,7 +154,7 @@
" {{\n",
" key: \"storage_type\"\n",
" value: {{ string_value: \"AUTO\" }}\n",
" }}\n",
"      }}\n",
"]\n",
"\n",
"dynamic_batching {{\n",

View File

@ -1,3 +1,12 @@
---
title: "Lab2: Introduction to R, exploring the box-plot"
author: "Vladislav Litvinov <vlad@sek1ro>"
output:
pdf_document:
toc_float: TRUE
---
# Data preparation
```{r}
df = read.csv("./zipIncome.txt", sep = "|")
colnames(df) <- c("zipCode", "income")
@ -10,7 +19,9 @@ if (any(is.na(df[,2]))) {
mean(df$income)
median(df$income, na.rm=TRUE)
```
# Histograms and box-plot "whiskers"
```{r}
plot(y=df$income, x=df$zipCode, xlab="income", ylab="zipCode")
df$incomelog = log10(df$income)
hist(df$incomelog, breaks=80)
@ -42,8 +53,9 @@ ggplot(df, aes(x=zipCode, y=income, color=zipCode)) +
breaks = c(1e4, 25e3, 5e4, 1e5, 2e5, 5e5)
) +
labs(
title = "Распределение доходов по почтовым индексам",
title = "Income distribution by ZIP codes",
subtitle = "Scatter plot jitter",
) +
theme_minimal()
```

View File

@ -1,9 +1,20 @@
---
title: "Lab4: K-means, DB-scan and dendrograms"
author: "Vladislav Litvinov <vlad@sek1ro>"
output:
pdf_document:
toc_float: TRUE
---
# Data preparation
```{r}
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')
load("./income_elec_state.Rdata")
df = income_elec_state
df$incomelog = log10(df$income)
remove(income_elec_state)
```
# Function to compute Within-Cluster Sum of Squares for choosing optimal K
```{r}
elbow_wss = function(df) {
max_k = 10
wss = numeric(max_k)
@ -19,7 +30,9 @@ elbow_wss = function(df) {
wss_ratio = wss_diff[-1] / wss_diff[-length(wss_diff)]
return(which.min(wss_ratio))
}
```
# Scatter-plot: elec vs income and log-income
```{r}
library(ggplot2)
plot_kmeans = function(data, k, log) {
@ -74,7 +87,9 @@ klog = elbow_wss(datalog)
plot_kmeans(data, k, log=FALSE)
plot_kmeans(data, klog, log=TRUE)
```
# Map of USA
```{r}
library(maps)
res = kmeans(data, centers = k)
map_color = res$cluster[order(names(res$cluster))]
@ -94,6 +109,10 @@ klog = elbow_wss(datalog)
plot_kmeans(data, k, log=FALSE)
plot_kmeans(data, klog, log=TRUE)
```
# Differences between dendrogram algorithms: 'single', 'complete', 'ward.D', 'average'
```{r}
library(ggdendro)
@ -136,3 +155,4 @@ clust = hclust(distance, method = "single")
plot(ggdendrogram(clust))
cutree(clust, k = 3)
```

View File

@ -1,3 +1,12 @@
---
title: "Lab5: Associative rules, Apriori"
author: "Vladislav Litvinov <vlad@sek1ro>"
output:
pdf_document:
toc_float: TRUE
---
# Histogram of transaction frequencies
```{r}
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')
library(arules)
library(arulesViz)
@ -10,7 +19,9 @@ ift = sort(itemFrequency(ts), decreasing = TRUE)
(most_frequent_item = ift[1])
(max_ts_size = max(size(ts)))
```
# Model training. Rules computing
```{r}
rules = apriori(ts, parameter = list(support = 0.01, confidence = 0))
length(rules)
plot(rules, jitter = 0)
@ -18,7 +29,9 @@ plot(rules, jitter = 0)
rules50 = apriori(ts, parameter = list(support = 0.01, confidence = 0.5))
length(rules50)
plot(rules50, jitter = 0)
```
# Applying a manual threshold: confidence = 0.5, plot comparison
```{r}
library(ggplot2)
asc = function(q, colors = c("lightgray", "red")) {
q = q[order(q$lift), ]
@ -39,9 +52,11 @@ asc(quality50, colors = c("navy", "cyan"))
quality = as.data.frame(quality(rules))
asc(subset(quality, quality$confidence > 0.5))
plot(rules, measure = c("support", "lift"), engine = "interactive", shading = "confidence")
plot(rules, engine = "interactive")
```
# Top 3 rules by lift. 'Relation' graph and matrix
```{r}
plot(rules, measure = c("support", "lift"), shading = "confidence")
plot(rules)
filt_rules = rules[which(quality(rules)$confidence > 0.8)]
quality = as.data.frame(quality(filt_rules))
@ -55,8 +70,9 @@ plot(filt_rules,
top3_rules = head(sort(filt_rules, by = "lift", decreasing = TRUE), 3)
plot(top3_rules, method = "graph")
```
# Random picking of train and test datasets
```{r}
train_set = ts[1:8000]
test_set = ts[8001:10000]
@ -84,3 +100,4 @@ plot(comparison$train_support, comparison$test_support,
ylab = "test support",
pch = 19)
abline(0, 1, lty = 2)
```

View File

@ -1,3 +1,12 @@
---
title: "Lab6: Linear regression"
author: "Vladislav Litvinov <vlad@sek1ro>"
output:
pdf_document:
toc_float: TRUE
---
# Data preparation
```{r}
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')
df = read.csv('zeta.csv')
head(df)
@ -9,26 +18,31 @@ df = subset(df, 8 < meaneducation & meaneducation < 18 &
log_income = log10(df$meanhouseholdincome)
colnames(df) = c("age", "education", "employment", "householdincome")
```
# Linear regression graph
```{r}
library(ggplot2)
ggplot(df, aes(x = age, y = log_income)) +
geom_point() +
geom_smooth(method = "lm")
geom_smooth(method = "lm") +
theme_minimal()
lmmod = lm(log_income ~ age, data = df)
summary(lmmod)
```
# xm = sum(xi) / n
# sd = sqrt(sum(xi - xm) / (n - 1))
# mr = sd / sqrt(n): вариация
# t = (xm1 - xm2) / sqrt(mr1 ^ 2 + mr2 ^ 2)
# f = (n1 + n2) - 2: степень свободы
# p = 0,03 probability of obtaining test results at least as extreme as the result actually observed
# Formulas for Student's t-test and p-value
xm = sum(xi) / n
sd = sqrt(sum(xi - xm) / (n - 1))
mr = sd / sqrt(n)
t = (xm1 - xm2) / sqrt(mr1 ^ 2 + mr2 ^ 2)
f = (n1 + n2) - 2
p = 0.03: the probability of obtaining test results at least as extreme as the result actually observed
```{r}
ggplot(df, aes(x = education, y = log_income)) +
geom_point() +
geom_smooth(method = "lm")
geom_smooth(method = "lm") +
theme_minimal()
lmmod = lm(log_income ~ education, data = df)
@ -36,15 +50,17 @@ summary(lmmod)
ggplot(df, aes(x = employment, y = log_income)) +
geom_point() +
geom_smooth(method = "lm")
theme_minimal()
geom_smooth(method = "lm") +
theme_minimal()
lmmod = lm(log_income ~ employment, data = df)
summary(lmmod)
lmmod = lm(householdincome ~ age + education + employment, data = df)
summary(lmmod)
```
# Random train and test dataset slices
```{r}
set.seed(Sys.Date())
test_idx = sample(1:nrow(df), 5000, replace = FALSE)
test_df = df[test_idx, ]
@ -57,7 +73,9 @@ ggplot(test_df, aes(x = p_income, y = householdincome)) +
geom_abline(intercept = 0, slope = 1) +
geom_point()+
theme_minimal()
```
# Error measurements: MSE, RMSE, MAE, Bias
```{r}
test_df = test_df[order(test_df$p_income), ]
slice_n = 10
slice_size = floor(nrow(test_df) / slice_n)
@ -76,4 +94,4 @@ for (i in 0 : (slice_n - 1)) {
rmse, mae, bias)
)
}
```

View File

@ -1,3 +1,12 @@
---
title: "Lab7: Logistic regression"
author: "Vladislav Litvinov <vlad@sek1ro>"
output:
pdf_document:
toc_float: TRUE
---
# Data preparation
```{r}
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')
survey <- read.csv('survey.csv')
@ -8,9 +17,10 @@ survey$price30 <- ifelse(survey$Price == 30, 1, 0)
head(survey)
survey$one <- 1
#https://stats.stackexchange.com/questions/48178/how-to-interpret-the-intercept-term-in-a-glm
```
# Model training
[Useful link 1](https://stats.stackexchange.com/questions/48178/how-to-interpret-the-intercept-term-in-a-glm)
```{r}
model <- glm(
MYDEPV ~ Income + Age + price20 + price30,
binomial(link = "logit"),
@ -21,7 +31,9 @@ quantile(residuals(model))
#https://library.virginia.edu/data/articles/understanding-deviance-residuals
#Residuals are the differences between what we observe and what our model predicts.
#Residuals greater than the absolute value of 3 are in the tails of a standard normal distribution and usually indicate strain in the model.
```
# Getting coefficients
```{r}
beta_income <- coef(model)["Income"]
pct_income <- (exp(beta_income) - 1) * 100
pct_income
@ -29,7 +41,9 @@ pct_income
beta_price30 <- coef(model)["price30"]
pct_price30 <- (exp(beta_price30 * 20) - 1) * 100
pct_price30
```
# Predicts for the model
```{r}
survey$odds_ratio <- exp(predict(model))
survey$prediction <- survey$odds_ratio / (1 + survey$odds_ratio)
head(survey)
@ -46,3 +60,4 @@ new_person <- data.frame(
prob <- predict(model, new_person, type="response")
prob
```

View File

@ -1,18 +1,30 @@
---
title: "Lab8: Naive bayes classifier"
author: "Vladislav Litvinov <vlad@sek1ro>"
output:
pdf_document:
toc_float: TRUE
---
# Data splitting (test, train datasets)
```{r}
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')
df = read.csv("nbtrain.csv", stringsAsFactors = TRUE)
trdf = df[1:9010,]
tedf = df[9011:10010,]
remove(df)
library(e1071)
```
# Model training
```{r}
nb = naiveBayes(income ~ age + sex + educ, data = trdf, laplace = 1)
# p(A|B)*p(B) = p(AB) = p(B|A)*p(A)
# p(A|B) = p(B|A) * p(A) / p(B)
# posterior = likelihood * prior / marginal
nb$apriori / sum (nb$apriori)
nb$tables
```
# Model testing. Confidence total and confidence by class
```{r}
pd = predict(nb, tedf)
(conf_mat = table(Actual = tedf$income, Predicted = pd))
@ -38,9 +50,9 @@ pd = predict(nb, tedf)
(conf_mat = table(Actual = tedf$sex, Predicted = pd))
conf_tot(conf_mat)
conf_class(conf_mat)
```
# Separated male and female
```{r}
male = trdf[trdf$sex == "M", ]
female = trdf[trdf$sex == "F", ]
@ -63,3 +75,4 @@ nbrandom = function() {
set.seed(Sys.time())
nbrandom()
```