diff --git a/circuit/25-2/course_project/index.js b/circuit/25-2/course_project/index.js index 8a362db..bf5f510 100644 --- a/circuit/25-2/course_project/index.js +++ b/circuit/25-2/course_project/index.js @@ -146,7 +146,7 @@ function genTables() { break case '$': - const args_end = gen.indexOf('$', i+2) + const args_end = gen.indexOf('$', i+1) const args = gen.slice(i+1, args_end).split(',') switch (args[0]) { case 'C': diff --git a/ds/25-1/2/3-06_xgboost.ipynb b/ds/25-1/2/3-06_xgboost.ipynb index c8a67d0..3ccd0c0 100644 --- a/ds/25-1/2/3-06_xgboost.ipynb +++ b/ds/25-1/2/3-06_xgboost.ipynb @@ -663,7 +663,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": null, "metadata": {}, "outputs": [ { diff --git a/ds/25-1/2/3-07_triton.ipynb b/ds/25-1/2/3-07_triton.ipynb index 153a03c..57d454e 100644 --- a/ds/25-1/2/3-07_triton.ipynb +++ b/ds/25-1/2/3-07_triton.ipynb @@ -120,7 +120,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "61d898fb-a8d2-4d1c-a13f-2c4be6c18969", "metadata": {}, "outputs": [], @@ -154,7 +154,7 @@ " {{\n", " key: \"storage_type\"\n", " value: {{ string_value: \"AUTO\" }}\n", - " }}\n", + " }}т\n", "]\n", "\n", "dynamic_batching {{\n", diff --git a/ds/25-1/r/2.R b/ds/25-1/r/2.rmd similarity index 79% rename from ds/25-1/r/2.R rename to ds/25-1/r/2.rmd index 803537c..824b880 100644 --- a/ds/25-1/r/2.R +++ b/ds/25-1/r/2.rmd @@ -1,3 +1,12 @@ +--- +title: "Lab2: Introduction to R, exploring the box-plot" +author: "Vladislav Litvinov " +output: + pdf_document: + toc_float: TRUE +--- +# Data preparation +```{r} df = read.csv("./zipIncome.txt", sep = "|") colnames(df) <- c("zipCode", "income") @@ -10,7 +19,9 @@ if (any(is.na(df[,2]))) { mean(df$income) median(df$income, na.rm=TRUE) - +``` +# Histograms and box-plot "whiskers" +```{r} plot(y=df$income, x=df$zipCode, xlab="income", ylab="zipCode") df$incomelog = log10(df$income) hist(df$incomelog, breaks=80) @@ -42,8 +53,9 @@ ggplot(df, aes(x=zipCode, y=income, color=zipCode)) + breaks = c(1e4, 25e3, 5e4, 1e5, 2e5, 5e5) ) + labs( - title = "Распределение доходов по почтовым индексам", + title = "Income distribution by ZIP codes", subtitle = "Scatter plot jitter", ) + theme_minimal() +``` \ No newline at end of file diff --git a/ds/25-1/r/4.R b/ds/25-1/r/4.rmd similarity index 86% rename from ds/25-1/r/4.R rename to ds/25-1/r/4.rmd index 063bf81..8abd54d 100644 --- a/ds/25-1/r/4.R +++ b/ds/25-1/r/4.rmd @@ -1,9 +1,20 @@ +--- +title: "Lab4: K-means, DB-scan and dendrograms" +author: "Vladislav Litvinov " +output: + pdf_document: + toc_float: TRUE +--- +# Data preparationc +```{r} setwd('/home/sek1ro/git/public/lab/ds/25-1/r') load("./income_elec_state.Rdata") df = income_elec_state df$incomelog = log10(df$income) remove(income_elec_state) - +``` +# Function to compute Within-Cluster Sum of Squares for choosing optimal K +```{r} elbow_wss = function(df) { max_k = 10 wss = numeric(max_k) @@ -19,7 +30,9 @@ elbow_wss = function(df) { wss_ratio = wss_diff[-1] / wss_diff[-length(wss_diff)] return(which.min(wss_ratio)) } - +``` +# Scatter-plot: elec vs income and log-income +```{r} library(ggplot2) plot_kmeans = function(data, k, log) { @@ -74,7 +87,9 @@ klog = elbow_wss(datalog) plot_kmeans(data, k, log=FALSE) plot_kmeans(data, klog, log=TRUE) - +``` +# Map of USA +```{r} library(maps) res = kmeans(data, centers = k) map_color = res$cluster[order(names(res$cluster))] @@ -94,6 +109,10 @@ klog = elbow_wss(datalog) plot_kmeans(data, k, log=FALSE) plot_kmeans(data, klog, log=TRUE) +``` +# Differences within dendrogramm algo: 'single', 'complete', 'ward.D', 'average' +```{r} + library(ggdendro) @@ -136,3 +155,4 @@ clust = hclust(distance, method = "single") plot(ggdendrogram(clust)) cutree(clust, k = 3) +``` diff --git a/ds/25-1/r/5.R b/ds/25-1/r/5.rmd similarity index 82% rename from ds/25-1/r/5.R rename to ds/25-1/r/5.rmd index 5772c6f..97bce23 100644 --- a/ds/25-1/r/5.R +++ b/ds/25-1/r/5.rmd @@ -1,3 +1,12 @@ +--- +title: "Lab5: Associative rules, Apriori" +author: "Vladislav Litvinov " +output: + pdf_document: + toc_float: TRUE +--- +# Histogram of transaction frequencies +```{r} setwd('/home/sek1ro/git/public/lab/ds/25-1/r') library(arules) library(arulesViz) @@ -10,7 +19,9 @@ ift = sort(itemFrequency(ts), decreasing = TRUE) (most_frequent_item = ift[1]) (max_ts_size = max(size(ts))) - +``` +# Model training. Rules computing +```{r} rules = apriori(ts, parameter = list(support = 0.01, confidence = 0)) length(rules) plot(rules, jitter = 0) @@ -18,7 +29,9 @@ plot(rules, jitter = 0) rules50 = apriori(ts, parameter = list(support = 0.01, confidence = 0.5)) length(rules50) plot(rules50, jitter = 0) - +``` +# Manual threshold applying: confidence = 0.5, plots comparsion +```{r} library(ggplot2) asc = function(q, colors = c("lightgray", "red")) { q = q[order(q$lift), ] @@ -39,9 +52,11 @@ asc(quality50, colors = c("navy", "cyan")) quality = as.data.frame(quality(rules)) asc(subset(quality, quality$confidence > 0.5)) - -plot(rules, measure = c("support", "lift"), engine = "interactive", shading = "confidence") -plot(rules, engine = "interactive") +``` +# Top 3 rules by lift. 'Relation' graph and matrix +```{r} +plot(rules, measure = c("support", "lift"), shading = "confidence") +plot(rules) filt_rules = rules[which(quality(rules)$confidence > 0.8)] quality = as.data.frame(quality(filt_rules)) @@ -55,8 +70,9 @@ plot(filt_rules, top3_rules = head(sort(filt_rules, by = "lift", decreasing = TRUE), 3) plot(top3_rules, method = "graph") - - +``` +# Random picking of train and test datasets +```{r} train_set = ts[1:8000] test_set = ts[8001:10000] @@ -84,3 +100,4 @@ plot(comparison$train_support, comparison$test_support, ylab = "test support", pch = 19) abline(0, 1, lty = 2) +``` \ No newline at end of file diff --git a/ds/25-1/r/6.R b/ds/25-1/r/6.rmd similarity index 74% rename from ds/25-1/r/6.R rename to ds/25-1/r/6.rmd index ddbb585..3379c01 100644 --- a/ds/25-1/r/6.R +++ b/ds/25-1/r/6.rmd @@ -1,3 +1,12 @@ +--- +title: "Lab6: Linear regression" +author: "Vladislav Litvinov " +output: + pdf_document: + toc_float: TRUE +--- +# Data preparation +```{r} setwd('/home/sek1ro/git/public/lab/ds/25-1/r') df = read.csv('zeta.csv') head(df) @@ -9,26 +18,31 @@ df = subset(df, 8 < meaneducation & meaneducation < 18 & log_income = log10(df$meanhouseholdincome) colnames(df) = c("age", "education", "employment", "householdincome") - +``` +# Linear regression graph +```{r} library(ggplot2) ggplot(df, aes(x = age, y = log_income)) + geom_point() + - geom_smooth(method = "lm") + geom_smooth(method = "lm") + theme_minimal() lmmod = lm(log_income ~ age, data = df) summary(lmmod) +``` -# xm = sum(xi) / n -# sd = sqrt(sum(xi - xm) / (n - 1)) -# mr = sd / sqrt(n): вариация -# t = (xm1 - xm2) / sqrt(mr1 ^ 2 + mr2 ^ 2) -# f = (n1 + n2) - 2: степень свободы -# p = 0,03 probability of obtaining test results at least as extreme as the result actually observed +# Formulas for Student's t-test and p-value +> xm = sum(xi) / n +sd = sqrt(sum(xi - xm) / (n - 1)) +mr = sd / sqrt(n) +t = (xm1 - xm2) / sqrt(mr1 ^ 2 + mr2 ^ 2) +f = (n1 + n2) - 2 +p = 0,03 probability of obtaining test results at least as extreme as the result actually observed +```{r} ggplot(df, aes(x = education, y = log_income)) + geom_point() + - geom_smooth(method = "lm") + geom_smooth(method = "lm") + theme_minimal() lmmod = lm(log_income ~ education, data = df) @@ -36,15 +50,17 @@ summary(lmmod) ggplot(df, aes(x = employment, y = log_income)) + geom_point() + - geom_smooth(method = "lm") -theme_minimal() + geom_smooth(method = "lm") + + theme_minimal() lmmod = lm(log_income ~ employment, data = df) summary(lmmod) lmmod = lm(householdincome ~ age + education + employment, data = df) summary(lmmod) - +``` +# Random test and train datasets slices +```{r} set.seed(Sys.Date()) test_idx = sample(1:nrow(df), 5000, replace = FALSE) test_df = df[test_idx, ] @@ -57,7 +73,9 @@ ggplot(test_df, aes(x = p_income, y = householdincome)) + geom_abline(intercept = 0, slope = 1) + geom_point()+ theme_minimal() - +``` +# Error measurements: MSE, RMSE, MAE, Bias +```{r} test_df = test_df[order(test_df$p_income), ] slice_n = 10 slice_size = floor(nrow(test_df) / slice_n) @@ -76,4 +94,4 @@ for (i in 0 : (slice_n - 1)) { rmse, mae, bias) ) } - +``` \ No newline at end of file diff --git a/ds/25-1/r/7.R b/ds/25-1/r/7.rmd similarity index 76% rename from ds/25-1/r/7.R rename to ds/25-1/r/7.rmd index 4f27def..3f9875b 100644 --- a/ds/25-1/r/7.R +++ b/ds/25-1/r/7.rmd @@ -1,3 +1,12 @@ +--- +title: "Lab7: Logistic regression" +author: "Vladislav Litvinov " +output: + pdf_document: + toc_float: TRUE +--- +# Data preparation +```{r} setwd('/home/sek1ro/git/public/lab/ds/25-1/r') survey <- read.csv('survey.csv') @@ -8,9 +17,10 @@ survey$price30 <- ifelse(survey$Price == 30, 1, 0) head(survey) survey$one <- 1 - -#https://stats.stackexchange.com/questions/48178/how-to-interpret-the-intercept-term-in-a-glm - +``` +# Model training +[Useful link 1](https://stats.stackexchange.com/questions/48178/how-to-interpret-the-intercept-term-in-a-glm) +```{r} model <- glm( MYDEPV ~ Income + Age + price20 + price30, binomial(link = "logit"), @@ -21,7 +31,9 @@ quantile(residuals(model)) #https://library.virginia.edu/data/articles/understanding-deviance-residuals #Residuals are the differences between what we observe and what our model predicts. #Residuals greater than the absolute value of 3 are in the tails of a standard normal distribution and usually indicate strain in the model. - +``` +# Getting coefficients +```{r} beta_income <- coef(model)["Income"] pct_income <- (exp(beta_income) - 1) * 100 pct_income @@ -29,7 +41,9 @@ pct_income beta_price30 <- coef(model)["price30"] pct_price30 <- (exp(beta_price30 * 20) - 1) * 100 pct_price30 - +``` +# Predicts for the model +```{r} survey$odds_ratio <- exp(predict(model)) survey$prediction <- survey$odds_ratio / (1 + survey$odds_ratio) head(survey) @@ -46,3 +60,4 @@ new_person <- data.frame( prob <- predict(model, new_person, type="response") prob +``` \ No newline at end of file diff --git a/ds/25-1/r/8.R b/ds/25-1/r/8.rmd similarity index 82% rename from ds/25-1/r/8.R rename to ds/25-1/r/8.rmd index bf8c333..8138631 100644 --- a/ds/25-1/r/8.R +++ b/ds/25-1/r/8.rmd @@ -1,18 +1,30 @@ +--- +title: "Lab8: Naive bayes classifier" +author: "Vladislav Litvinov " +output: + pdf_document: + toc_float: TRUE +--- +# Data splitting (test, train datasets) +```{r} setwd('/home/sek1ro/git/public/lab/ds/25-1/r') df = read.csv("nbtrain.csv", stringsAsFactors = TRUE) trdf = df[1:9010,] tedf = df[9011:10010,] remove(df) library(e1071) - +``` +# Model training +```{r} nb = naiveBayes(income ~ age + sex + educ, data = trdf, laplace = 1) # p(A|B)*p(B) = p(AB) = p(B|A)*p(A) # p(A|B) = p(B|A) * p(A) / p(B) # апостер = услов * априор / маргин nb$apriori / sum (nb$apriori) nb$tables - - +``` +# Model testing. Confidence total and confidence by class +```{r} pd = predict(nb, tedf) (conf_mat = table(Actual = tedf$income, Predicted = pd)) @@ -38,9 +50,9 @@ pd = predict(nb, tedf) (conf_mat = table(Actual = tedf$sex, Predicted = pd)) conf_tot(conf_mat) conf_class(conf_mat) - - - +``` +# Separated male and female +```{r} male = trdf[trdf$sex == "M", ] female = trdf[trdf$sex == "F", ] @@ -63,3 +75,4 @@ nbrandom = function() { set.seed(Sys.time()) nbrandom() +``` \ No newline at end of file