Files
lab/ds/25-1/r/8.rmd
2025-12-27 22:36:25 +03:00

78 lines
1.8 KiB
Plaintext

---
title: "Lab 8: Naive Bayes classifier"
author: "Vladislav Litvinov <vlad@sek1ro>"
output:
  pdf_document:
    toc_float: TRUE
---
# Data splitting (test, train datasets)
```{r}
# Load the labelled data set. The path is relative to this document: knitr
# evaluates chunks in the directory of the .Rmd file, so no setwd() with a
# machine-specific absolute path is needed.
library(e1071)

df <- read.csv("nbtrain.csv", stringsAsFactors = TRUE)

# The split below is positional: the first 9010 rows train the model and the
# following 1000 rows are held out for testing. Fail early if the file is
# shorter, because out-of-range row indices would silently yield all-NA rows.
stopifnot(nrow(df) >= 10010)
trdf <- df[1:9010, ]
tedf <- df[9011:10010, ]
remove(df)
```
# Model training
```{r}
# Fit a naive Bayes model that predicts income from age, sex and education
# (laplace = 1).
nb <- naiveBayes(
  income ~ age + sex + educ,
  data = trdf,
  laplace = 1
)
# Bayes' rule:
#   p(A|B) * p(B) = p(AB) = p(B|A) * p(A)
#   p(A|B) = p(B|A) * p(A) / p(B)
#   posterior = conditional * prior / marginal
# nb$apriori normalised so that the entries sum to 1.
prop.table(nb$apriori)
# Per-predictor tables stored in the fitted model.
nb$tables
```
# Model testing. Total error rate and per-class error rates
```{r}
# Classify the held-out rows and cross-tabulate actual vs. predicted income.
pd <- predict(nb, newdata = tedf)
conf_mat <- table(Actual = tedf$income, Predicted = pd)
conf_mat
# Print the overall misclassification rate of a confusion matrix.
#
# conf_mat: square table/matrix of counts whose diagonal holds the correctly
#           classified cases (rows and columns in the same class order).
# Returns the error rate (a fraction in [0, 1]) invisibly.
conf_tot <- function(conf_mat) {
  err <- 1 - sum(diag(conf_mat)) / sum(conf_mat)
  # Labelled, newline-terminated line in the same format as conf_class(), so
  # the figure does not run into whatever is printed next.
  cat(sprintf("total error %.2f%%\n", err * 100))
  invisible(err)
}
# Print the misclassification rate of every class (row) of a confusion matrix.
#
# conf_mat: table/matrix of counts carrying the same class names on its rows
#           and columns; cells are looked up by those names.
# Writes one line per class: "<class> error <percent>%".
conf_class <- function(conf_mat) {
  classes <- rownames(conf_mat)
  err <- vapply(
    classes,
    # Share of the class's row that did not land in its own column.
    function(cls) 1 - conf_mat[cls, cls] / sum(conf_mat[cls, ]),
    numeric(1)
  )
  cat(sprintf("%s error %.2f%%\n", classes, err * 100), sep = "")
}
# Error rates of the income model on the held-out rows.
conf_tot(conf_mat)
conf_class(conf_mat)

# Second model: predict sex from age, education and income. It reuses the
# names nb, pd and conf_mat, replacing the income model's objects.
nb <- naiveBayes(
  sex ~ age + educ + income,
  data = trdf,
  laplace = 1
)
prop.table(nb$apriori)
nb$tables

pd <- predict(nb, newdata = tedf)
conf_mat <- table(Actual = tedf$sex, Predicted = pd)
conf_mat

conf_tot(conf_mat)
conf_class(conf_mat)
```
# Balanced random sample of male and female records
```{r}
# Partition the training rows by the value of the sex column.
male <- trdf[trdf[["sex"]] == "M", ]
female <- trdf[trdf[["sex"]] == "F", ]
# Train and evaluate a sex classifier on a balanced random sample.
#
# Draws `n` rows without replacement from each of `males` and `females`,
# fits a naive Bayes model for sex on the combined sample, prints the
# normalised class priors, the model tables, the confusion matrix on `test`
# and the error rates, and returns the confusion matrix invisibly.
#
# n:       rows to draw per sex (default 3500).
# males:   data frame of male training rows (default: global `male`).
# females: data frame of female training rows (default: global `female`).
# test:    data frame to evaluate on (default: global `tedf`).
nbrandom <- function(n = 3500, males = male, females = female, test = tedf) {
  # Sampling without replacement cannot draw more rows than a group holds.
  stopifnot(n <= nrow(males), n <= nrow(females))

  mdf <- males[sample.int(nrow(males), n), ]
  fdf <- females[sample.int(nrow(females), n), ]
  mfdf <- rbind(mdf, fdf)

  mfnb <- naiveBayes(sex ~ age + educ + income, data = mfdf, laplace = 1)

  # Inside a function nothing is auto-printed, so every result that should
  # appear in the report has to be printed explicitly.
  print(prop.table(mfnb$apriori))
  print(mfnb$tables)

  mfpd <- predict(mfnb, test)
  mfconf_mat <- table(Actual = test$sex, Predicted = mfpd)
  print(mfconf_mat)

  conf_tot(mfconf_mat)
  # Keep the overall figure and the per-class lines apart.
  cat("\n")
  conf_class(mfconf_mat)

  invisible(mfconf_mat)
}
# Seed the generator from the clock so every run draws a different sample,
# but report the seed so a particular run can be reproduced later by passing
# the reported value to set.seed().
seed <- as.integer(Sys.time())
set.seed(seed)
message("seed: ", seed)
nbrandom()
```