Compare commits

...

2 Commits

Author SHA1 Message Date
ba66cbd0ff ds: 2r, 4r, 5r, 8r 2025-12-11 14:58:04 +03:00
5fd0ff7154 circuit: 5, 6 2025-12-11 14:57:49 +03:00
10 changed files with 20203 additions and 4 deletions

BIN
circuit/25-1/5/README.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 169 KiB

BIN
circuit/25-1/6/README.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 229 KiB

BIN
circuit/25-1/6/lab6.pdf Normal file

Binary file not shown.

BIN
circuit/25-1/dz/08.12.pdf Normal file

Binary file not shown.

View File

@ -11,7 +11,7 @@ if (any(is.na(df[,2]))) {
mean(df$income)
median(df$income, na.rm=TRUE)
plot(x=df$income, y=df$zipCode, xlab="income", ylab="zipCode")
plot(y=df$income, x=df$zipCode, xlab="income", ylab="zipCode")
df$incomelog = log10(df$income)
hist(df$incomelog, breaks=80)
print(min_incomelog <- log10(7e3))

View File

@ -95,7 +95,44 @@ plot_kmeans(data, k, log=FALSE)
plot_kmeans(data, klog, log=TRUE)
library(ggdendro)
distance = dist(df, method = "euclidean")
clust = hclust(distance, method = "complete")
# Hierarchically cluster the "income"/"elec" columns of `df` and plot the
# k-cluster assignment as a labelled scatter plot.
#
# df      - data frame with numeric "income" and "elec" columns; row names
#           are used as point labels (presumably state names — TODO confirm).
# linkage - agglomeration method forwarded to hclust() ("complete",
#           "average", "single", ...).
# k       - number of clusters to cut the dendrogram into.
#
# Side effects: prints the cluster assignment vector, the augmented data
# frame, and the plot. Returns the value of print(plt).
plot_hclust = function(df, linkage, k) {
  # Cluster on the two numeric features only.
  data = df[, c("income", "elec")]
  distance = dist(data, method = "euclidean")
  clust = hclust(distance, method = linkage)
  # Cut the tree once and reuse the assignment (it was computed twice before).
  clusters = cutree(clust, k = k)
  data$cluster = as.factor(clusters)
  data$state = rownames(df)
  print(clusters)
  print(data)
  plt = ggplot() +
    geom_point(
      data = data,
      aes(
        x = income,
        y = elec,
        color = cluster
      )
    ) +
    geom_text(
      data = data,
      vjust = 1.5,
      size = 2,
      aes(
        x = income,
        y = elec,
        label = state
      )
    ) +
    # BUG FIX: theme_minimal() was a dead standalone statement (the "+" was
    # missing after geom_text(...)), so the theme was never applied.
    theme_minimal()
  print(plt)
}
plot_hclust(data, "average", 5)
distance = dist(data, method = "euclidean")
clust = hclust(distance, method = "single")
plot(ggdendrogram(clust))
cutree(clust, k = k)
cutree(clust, k = 3)

86
ds/25-1/r/5.R Normal file
View File

@ -0,0 +1,86 @@
# Association-rule mining lab (arules / arulesViz).
# NOTE(review): setwd() to an absolute personal path makes this script
# non-portable; it only runs on the author's machine.
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')
library(arules)
library(arulesViz)
# Read the transaction database; duplicate items inside a single
# transaction are dropped (rm.duplicates = TRUE).
ts = read.transactions("AssociationRules.csv",
sep = " ",
rm.duplicates = TRUE)
# Top-10 most frequent items by absolute count.
itemFrequencyPlot(ts, type = "absolute", topN = 10)
ift = sort(itemFrequency(ts), decreasing = TRUE)
# Outer parentheses force auto-printing of the assigned value at top level.
(most_frequent_item = ift[1])
(max_ts_size = max(size(ts)))
# Mine all rules with support >= 1% and no confidence cutoff.
rules = apriori(ts, parameter = list(support = 0.01, confidence = 0))
length(rules)
plot(rules, jitter = 0)
# Same support floor, but keep only rules with confidence >= 0.5.
rules50 = apriori(ts, parameter = list(support = 0.01, confidence = 0.5))
length(rules50)
plot(rules50, jitter = 0)
library(ggplot2)
# Scatter plot of association-rule quality: support (x) vs confidence (y),
# coloured by lift. Rules are drawn in increasing-lift order so the
# high-lift points end up on top.
#
# q      - data frame with numeric support, confidence and lift columns
#          (e.g. as.data.frame(quality(rules))).
# colors - low/high endpoints for the lift colour gradient.
#
# Returns the ggplot object (printed by the caller).
asc = function(q, colors = c("lightgray", "red")) {
  by_lift = q[order(q$lift), ]
  lift_limits = c(min(by_lift$lift), max(by_lift$lift))
  plt = ggplot(by_lift, aes(x = support, y = confidence, color = lift))
  plt = plt + geom_point()
  plt = plt + ylim(0, 1)
  plt = plt + xlim(0, 0.5)
  plt = plt + theme_minimal()
  plt + scale_color_gradientn(
    colors = colors,
    name = "Lift",
    limits = lift_limits
  )
}
# Visualise rule quality for the confidence >= 0.5 rule set.
quality50 = as.data.frame(quality(rules50))
asc(quality50, colors = c("navy", "cyan"))
# Same view for the unfiltered rules, restricted to confidence > 0.5
# after the fact (default gray/red gradient).
quality = as.data.frame(quality(rules))
asc(subset(quality, quality$confidence > 0.5))
# Interactive explorations of the full rule set.
plot(rules, measure = c("support", "lift"), engine = "interactive", shading = "confidence")
plot(rules, engine = "interactive")
# Keep only high-confidence rules and inspect the 10 lowest-lift ones
# (quality is sorted by descending lift, so tail() gives the weakest).
filt_rules = rules[which(quality(rules)$confidence > 0.8)]
quality = as.data.frame(quality(filt_rules))
quality = quality[order(-quality$lift),]
tail(quality, 10)
# Matrix view of the filtered rules, shaded by lift and confidence.
plot(filt_rules,
method = "matrix",
shading = c("lift", "confidence"),
engine = "grid")
# Graph view of the three highest-lift rules.
top3_rules = head(sort(filt_rules, by = "lift", decreasing = TRUE), 3)
plot(top3_rules, method = "graph")
# Holdout evaluation: mine on the first 8000 transactions, re-measure
# the same rules on the remaining 2000.
train_set = ts[1:8000]
test_set = ts[8001:10000]
train_rules = apriori(train_set, parameter = list(support = 0.01, confidence = 0.5))
test_quality = interestMeasure(train_rules,
measure = c("support", "confidence", "lift", "coverage"),
transactions = test_set)
# Compare train vs test metrics for the first 10 rules (mining order,
# not sorted by any measure).
comparison <- data.frame(
train_support = quality(train_rules)$support[1:10],
test_support = test_quality$support[1:10],
train_lift = quality(train_rules)$lift[1:10],
test_lift = test_quality$lift[1:10]
)
print(comparison)
# Train vs test lift; the dashed y = x line marks perfect agreement.
plot(comparison$train_lift, comparison$test_lift,
xlab = "train lift",
ylab = "test lift",
pch = 19)
abline(0, 1, lty = 2)
# Plot for support
plot(comparison$train_support, comparison$test_support,
xlab = "train support",
ylab = "test support",
pch = 19)
abline(0, 1, lty = 2)

65
ds/25-1/r/8.R Normal file
View File

@ -0,0 +1,65 @@
# Naive Bayes classification lab (e1071).
# NOTE(review): setwd() to an absolute personal path makes this script
# non-portable; it only runs on the author's machine.
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')
df = read.csv("nbtrain.csv", stringsAsFactors = TRUE)
# Fixed train/test split: first 9010 rows train, last 1000 test.
trdf = df[1:9010,]
tedf = df[9011:10010,]
remove(df)
library(e1071)
# Predict income class from age, sex and education; laplace = 1 smooths
# zero-count conditional probabilities.
nb = naiveBayes(income ~ age + sex + educ, data = trdf, laplace = 1)
# p(A|B)*p(B) = p(AB) = p(B|A)*p(A)
# p(A|B) = p(B|A) * p(A) / p(B)
# posterior = likelihood * prior / marginal
# Class priors (normalised counts) and per-feature conditional tables.
nb$apriori / sum (nb$apriori)
nb$tables
pd = predict(nb, tedf)
# Outer parentheses force auto-printing of the confusion matrix.
(conf_mat = table(Actual = tedf$income, Predicted = pd))
# Print the overall misclassification rate of a confusion matrix
# (1 - accuracy). Uses cat(), so the output carries no trailing newline.
conf_tot = function(conf_mat) {
  correct = sum(diag(conf_mat))
  total = sum(conf_mat)
  cat(1 - correct / total)
}
# Print the per-class error rate of a confusion matrix, one line per
# actual class (row): 1 - diagonal count / row total, as a percentage.
conf_class = function(conf_mat) {
  classes = rownames(conf_mat)
  for (cls in classes) {
    row_total = sum(conf_mat[cls, ])
    miss_rate = 1 - conf_mat[cls, cls] / row_total
    cat(sprintf("%s error %.2f%%\n", cls, miss_rate * 100))
  }
}
# Overall and per-class error of the income model.
conf_tot(conf_mat)
conf_class(conf_mat)
# Second model: predict sex from age, education and income.
nb = naiveBayes(sex ~ age + educ + income, data = trdf, laplace = 1)
# Class priors and conditional tables (auto-printed at top level).
nb$apriori / sum (nb$apriori)
nb$tables
pd = predict(nb, tedf)
(conf_mat = table(Actual = tedf$sex, Predicted = pd))
conf_tot(conf_mat)
conf_class(conf_mat)
# Split the training data by sex for the balanced-resampling experiment
# below (nbrandom).
male = trdf[trdf$sex == "M", ]
female = trdf[trdf$sex == "F", ]
# Train a Naive Bayes sex classifier on a class-balanced random sample
# (3500 male + 3500 female rows drawn from the globals `male`/`female`),
# evaluate it on the global test set `tedf`, and print the priors,
# conditional tables, confusion matrix, and overall/per-class error.
#
# NOTE(review): depends on globals male, female, tedf and on conf_tot /
# conf_class defined above; output varies with the RNG state.
nbrandom = function() {
  mdf = male[sample(1:nrow(male), 3500),]
  fdf = female[sample(1:nrow(female), 3500), ]
  mfdf = rbind(mdf, fdf)
  mfnb = naiveBayes(sex ~ age + educ + income, data = mfdf, laplace = 1)
  # BUG FIX: inside a function, bare expressions are not auto-printed
  # (unlike at top level), so these diagnostics were silently discarded.
  print(mfnb$apriori / sum(mfnb$apriori))
  print(mfnb$tables)
  mfpd = predict(mfnb, tedf)
  mfconf_mat = table(Actual = tedf$sex, Predicted = mfpd)
  print(mfconf_mat)
  conf_tot(mfconf_mat)
  conf_class(mfconf_mat)
}
# Seed the RNG from the wall clock so each run draws a different sample.
# NOTE(review): set.seed() coerces its argument to integer; passing a
# POSIXct appears to work, but as.integer(Sys.time()) would make the
# intent explicit — confirm against the set.seed documentation.
set.seed(Sys.time())
nbrandom()

10000
ds/25-1/r/AssociationRules.csv Normal file

File diff suppressed because it is too large Load Diff

10011
ds/25-1/r/nbtrain.csv Normal file

File diff suppressed because it is too large Load Diff