Compare commits
2 Commits
98e89cfea6
...
ba66cbd0ff
| Author | SHA1 | Date | |
|---|---|---|---|
| ba66cbd0ff | |||
| 5fd0ff7154 |
BIN
circuit/25-1/5/README.jpg
Normal file
BIN
circuit/25-1/5/README.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 169 KiB |
BIN
circuit/25-1/6/README.jpg
Normal file
BIN
circuit/25-1/6/README.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 229 KiB |
BIN
circuit/25-1/6/lab6.pdf
Normal file
BIN
circuit/25-1/6/lab6.pdf
Normal file
Binary file not shown.
BIN
circuit/25-1/dz/08.12.pdf
Normal file
BIN
circuit/25-1/dz/08.12.pdf
Normal file
Binary file not shown.
@ -11,7 +11,7 @@ if (any(is.na(df[,2]))) {
|
||||
mean(df$income)
|
||||
median(df$income, na.rm=TRUE)
|
||||
|
||||
plot(x=df$income, y=df$zipCode, xlab="income", ylab="zipCode")
|
||||
# Scatter plot of income (y) against zip code (x). The labels now name the
# variable actually drawn on each axis; they were swapped before.
plot(x = df$zipCode, y = df$income, xlab = "zipCode", ylab = "income")
|
||||
df$incomelog = log10(df$income)
|
||||
hist(df$incomelog, breaks=80)
|
||||
print(min_incomelog <- log10(7e3))
|
||||
|
||||
@ -95,7 +95,44 @@ plot_kmeans(data, k, log=FALSE)
|
||||
plot_kmeans(data, klog, log=TRUE)
|
||||
|
||||
library(ggdendro)
|
||||
distance = dist(df, method = "euclidean")
|
||||
clust = hclust(distance, method = "complete")
|
||||
|
||||
|
||||
# Cluster observations hierarchically on income and electricity usage and
# draw the resulting groups as a labelled scatter plot.
#
# Args:
#   df:      data frame with columns "income" and "elec"; its row names are
#            used as the text label of each point.
#   linkage: agglomeration method passed to hclust() (e.g. "average").
#   k:       number of clusters the tree is cut into.
#
# Side effects: prints the cluster assignment vector, the augmented data
# frame, and the plot.
plot_hclust = function(df, linkage, k) {
  data = df[, c("income", "elec")]

  # Distances are computed before the cluster/state columns are attached,
  # so only the two numeric features take part in the clustering.
  distance = dist(data, method = "euclidean")
  clust = hclust(distance, method = linkage)

  # Cut the tree once and reuse the assignment for both the column and the
  # console output.
  groups = cutree(clust, k = k)
  data$cluster = as.factor(groups)
  data$state = rownames(df)
  print(groups)
  print(data)

  plt = ggplot() +
    geom_point(
      data = data,
      aes(
        x = income,
        y = elec,
        color = cluster
      )
    ) +
    geom_text(
      data = data,
      vjust = 1.5,
      size = 2,
      aes(
        x = income,
        y = elec,
        label = state
      )
    ) +
    # The "+" above was missing, so the theme was evaluated on its own and
    # discarded instead of being applied to the plot.
    theme_minimal()
  print(plt)
}
|
||||
|
||||
plot_hclust(data, "average", 5)
|
||||
|
||||
distance = dist(data, method = "euclidean")
|
||||
|
||||
clust = hclust(distance, method = "single")
|
||||
plot(ggdendrogram(clust))
|
||||
cutree(clust, k = k)
|
||||
|
||||
cutree(clust, k = 3)
|
||||
|
||||
86
ds/25-1/r/5.R
Normal file
86
ds/25-1/r/5.R
Normal file
@ -0,0 +1,86 @@
|
||||
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')
|
||||
library(arules)
|
||||
library(arulesViz)
|
||||
ts = read.transactions("AssociationRules.csv",
|
||||
sep = " ",
|
||||
rm.duplicates = TRUE)
|
||||
|
||||
itemFrequencyPlot(ts, type = "absolute", topN = 10)
|
||||
ift = sort(itemFrequency(ts), decreasing = TRUE)
|
||||
|
||||
(most_frequent_item = ift[1])
|
||||
(max_ts_size = max(size(ts)))
|
||||
|
||||
rules = apriori(ts, parameter = list(support = 0.01, confidence = 0))
|
||||
length(rules)
|
||||
plot(rules, jitter = 0)
|
||||
|
||||
rules50 = apriori(ts, parameter = list(support = 0.01, confidence = 0.5))
|
||||
length(rules50)
|
||||
plot(rules50, jitter = 0)
|
||||
|
||||
library(ggplot2)
|
||||
# Scatter plot of rule quality measures: support on the x axis, confidence
# on the y axis, points coloured by lift.
#
# Args:
#   q:      data frame with `support`, `confidence` and `lift` columns.
#   colors: colours of the lift gradient, from lowest to highest lift.
#
# Returns the ggplot object. Rows are sorted by ascending lift before
# plotting, and the colour scale spans exactly the observed lift range.
asc = function(q, colors = c("lightgray", "red")) {
  by_lift = q[order(q$lift), ]
  lift_span = c(min(by_lift$lift), max(by_lift$lift))

  lift_scale = scale_color_gradientn(
    colors = colors,
    name = "Lift",
    limits = lift_span
  )

  points = ggplot(by_lift, aes(x = support, y = confidence, color = lift)) +
    geom_point()

  points +
    ylim(0, 1) +
    xlim(0, 0.5) +
    theme_minimal() +
    lift_scale
}
|
||||
|
||||
quality50 = as.data.frame(quality(rules50))
|
||||
asc(quality50, colors = c("navy", "cyan"))
|
||||
|
||||
quality = as.data.frame(quality(rules))
|
||||
asc(subset(quality, quality$confidence > 0.5))
|
||||
|
||||
plot(rules, measure = c("support", "lift"), engine = "interactive", shading = "confidence")
|
||||
plot(rules, engine = "interactive")
|
||||
|
||||
filt_rules = rules[which(quality(rules)$confidence > 0.8)]
|
||||
quality = as.data.frame(quality(filt_rules))
|
||||
quality = quality[order(-quality$lift),]
|
||||
tail(quality, 10)
|
||||
|
||||
plot(filt_rules,
|
||||
method = "matrix",
|
||||
shading = c("lift", "confidence"),
|
||||
engine = "grid")
|
||||
|
||||
top3_rules = head(sort(filt_rules, by = "lift", decreasing = TRUE), 3)
|
||||
plot(top3_rules, method = "graph")
|
||||
|
||||
|
||||
train_set = ts[1:8000]
|
||||
test_set = ts[8001:10000]
|
||||
|
||||
train_rules = apriori(train_set, parameter = list(support = 0.01, confidence = 0.5))
|
||||
test_quality = interestMeasure(train_rules,
|
||||
measure = c("support", "confidence", "lift", "coverage"),
|
||||
transactions = test_set)
|
||||
comparison <- data.frame(
|
||||
train_support = quality(train_rules)$support[1:10],
|
||||
test_support = test_quality$support[1:10],
|
||||
train_lift = quality(train_rules)$lift[1:10],
|
||||
test_lift = test_quality$lift[1:10]
|
||||
)
|
||||
|
||||
print(comparison)
|
||||
plot(comparison$train_lift, comparison$test_lift,
|
||||
xlab = "train lift",
|
||||
ylab = "test lift",
|
||||
pch = 19)
|
||||
abline(0, 1, lty = 2)
|
||||
|
||||
# Plot for support
|
||||
plot(comparison$train_support, comparison$test_support,
|
||||
xlab = "train support",
|
||||
ylab = "test support",
|
||||
pch = 19)
|
||||
abline(0, 1, lty = 2)
|
||||
65
ds/25-1/r/8.R
Normal file
65
ds/25-1/r/8.R
Normal file
@ -0,0 +1,65 @@
|
||||
setwd('/home/sek1ro/git/public/lab/ds/25-1/r')
|
||||
df = read.csv("nbtrain.csv", stringsAsFactors = TRUE)
|
||||
trdf = df[1:9010,]
|
||||
tedf = df[9011:10010,]
|
||||
remove(df)
|
||||
library(e1071)
|
||||
|
||||
nb = naiveBayes(income ~ age + sex + educ, data = trdf, laplace = 1)
|
||||
# p(A|B)*p(B) = p(AB) = p(B|A)*p(A)
|
||||
# p(A|B) = p(B|A) * p(A) / p(B)
|
||||
# posterior = likelihood * prior / marginal
|
||||
nb$apriori / sum (nb$apriori)
|
||||
nb$tables
|
||||
|
||||
|
||||
pd = predict(nb, tedf)
|
||||
(conf_mat = table(Actual = tedf$income, Predicted = pd))
|
||||
|
||||
# Write the overall misclassification rate of a confusion matrix to the
# console: one minus the share of observations on the diagonal.
# The value is emitted with cat() and no trailing newline.
conf_tot = function(conf_mat) {
  hit_share = sum(diag(conf_mat)) / sum(conf_mat)
  cat(1 - hit_share)
}
|
||||
|
||||
# Write the per-class error rate of a confusion matrix to the console, one
# line per row label.
# For each row name, the error is one minus the cell where that name meets
# itself (row and column looked up by name) divided by the row total.
conf_class = function(conf_mat) {
  labels = rownames(conf_mat)
  for (label in labels) {
    row_total = sum(conf_mat[label, ])
    correct = conf_mat[label, label]
    miss_rate = 1 - (correct / row_total)
    cat(sprintf("%s error %.2f%%\n", label, miss_rate * 100))
  }
}
|
||||
|
||||
conf_tot(conf_mat)
|
||||
conf_class(conf_mat)
|
||||
|
||||
nb = naiveBayes(sex ~ age + educ + income, data = trdf, laplace = 1)
|
||||
nb$apriori / sum (nb$apriori)
|
||||
nb$tables
|
||||
|
||||
pd = predict(nb, tedf)
|
||||
(conf_mat = table(Actual = tedf$sex, Predicted = pd))
|
||||
conf_tot(conf_mat)
|
||||
conf_class(conf_mat)
|
||||
|
||||
|
||||
|
||||
male = trdf[trdf$sex == "M", ]
|
||||
female = trdf[trdf$sex == "F", ]
|
||||
|
||||
# Train a naive Bayes classifier for `sex` on a class-balanced random
# sample of the training data and report how it does on the test rows.
#
# Reads the globals `male` and `female` (training rows split by sex) and
# `tedf` (held-out rows), and calls conf_tot() / conf_class() for the error
# report.
#
# Args:
#   n: number of rows drawn without replacement from each of `male` and
#      `female`. Defaults to 3500, the size previously hard-coded.
#      sample() raises an error if n exceeds the rows available.
#
# Side effects: prints the class priors, the conditional tables, the
# confusion matrix, and the total and per-class error rates.
nbrandom = function(n = 3500) {
  mdf = male[sample(seq_len(nrow(male)), n), ]
  fdf = female[sample(seq_len(nrow(female)), n), ]

  mfdf = rbind(mdf, fdf)

  mfnb = naiveBayes(sex ~ age + educ + income, data = mfdf, laplace = 1)
  # Expressions are not auto-printed inside a function body, so these
  # results must be printed explicitly or they are silently discarded.
  print(mfnb$apriori / sum(mfnb$apriori))
  print(mfnb$tables)

  mfpd = predict(mfnb, tedf)
  mfconf_mat = table(Actual = tedf$sex, Predicted = mfpd)
  print(mfconf_mat)
  conf_tot(mfconf_mat)
  conf_class(mfconf_mat)
}
|
||||
|
||||
set.seed(Sys.time())
|
||||
nbrandom()
|
||||
|
||||
10000
ds/25-1/r/AssociationRules.csv
Normal file
10000
ds/25-1/r/AssociationRules.csv
Normal file
File diff suppressed because it is too large
Load Diff
10011
ds/25-1/r/nbtrain.csv
Normal file
10011
ds/25-1/r/nbtrain.csv
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user