# Data-science lab script, rebuilt from an interactive session history.
#
# Sections:
#   1. Base R warm-up: vectors, factors, indexing, data frames, summaries.
#   2. Logistic regression on the survey data.
#   3. Classification trees (rpart): fit, plot, evaluate, ROC/AUC, pruning.
#
# Every distinct step of the session is kept once. Commands that cannot run
# in a script (session quit, help browser, typos, calls on objects that did
# not exist yet) were corrected or left out so the file can be sourced from
# top to bottom.

library(rpart)
library(rpart.plot)
library(ROCR)

# Paths ----

# Directory that holds survey.csv. Building the path with file.path() avoids
# changing the working directory as a side effect.
data_dir <- "/home/sek1ro/git/public/lab/ds/25-1/r"
survey_path <- file.path(data_dir, "survey.csv")

# Relative path: resolved against the directory R was started in.
zip_income_path <- "git/public/lab/ds/25-1/r2/zipIncome.txt"

# Helpers ----

# Fraction of each actual class that was predicted correctly.
# `conf` is a square confusion table with actual classes in rows and
# predicted classes in columns.
class_recall <- function(conf) {
  diag(conf) / rowSums(conf)
}

# 1. Base R warm-up ----

a <- 3
v <- c(1, 2, 3, 4, 5)

db <- read.table(zip_income_path)

# Levels that match none of the values: every element of the factor is NA.
levels <- c(1, 2, 3)
ratings <- c("a", "b", "c")
f <- factor(ratings, levels)
print(f)
print(length(f))
print(levels(f))

# Indexing is 1-based: index 0 gives an empty vector, an index past the end
# gives NA.
print(levels[0])
print(levels[1])
for (i in c(1, 2, 0, 4, 3)) {
  print(ratings[i])
}

# Factor levels must be unique; repeated values belong in the data vector.
ratings <- c("a", "b", "a", "c")
levels <- c("a", "b", "c")
f <- factor(ratings, levels)
for (i in 0:5) {
  print(f[i])
}

# Built-in constants: pi exists, Euler's number is obtained as exp(1).
print(pi)
print(exp(1))

# Element-wise arithmetic on two vectors of equal length.
v <- 1:10
w <- 15:24
x <- v * w

# Data frame indexing on the zip/income table.
# NOTE(review): the column layout of zipIncome.txt is not visible here, so
# only position-based selections are used; names(db) shows the real names.
print(names(db))
print(db)
print(db[1])     # first column, kept as a data frame
print(db[, 1])   # first column, as a vector
print(db[1, ])   # first row
print(db[1:2, ]) # first two rows
print(db[2, ])   # second row
print(db$V1)     # V1 is the default first-column name when there is no header

# The same selections on a hand-built one-column, one-row data frame.
df <- data.frame(name = c("a"))
print(df)
print(df$name)
print(df[])
print(df[1])
print(df[, 1])
print(df[1, ])
print(class(df))

# Descriptive statistics and quick plots for x.
print(summary(x))
print(var(x))
print(sd(x))
print(range(x))
plot(x)
hist(x)
str(x)

# 2. Logistic regression ----

survey <- read.csv(survey_path)
print(head(survey))

# Indicator columns for the price levels 20 and 30.
survey$price20 <- ifelse(survey$Price == 20, 1, 0)
survey$price30 <- ifelse(survey$Price == 30, 1, 0)
print(head(survey))

# Constant column; not referenced by the model below.
survey$one <- 1

model <- glm(
  MYDEPV ~ Income + Age + price20 + price30,
  family = binomial(link = "logit"),
  data = survey
)
print(summary(model))
print(quantile(residuals(model)))

# 3. Classification trees ----

# Reload so the train/test frames do not carry the helper columns added in
# section 2.
survey <- read.csv(survey_path)

# Fixed split by row position.
# NOTE(review): assumes survey.csv has at least 750 rows -- confirm.
train_df <- survey[1:600, ]
test_df <- survey[601:750, ]

# Tree grown with the information (entropy) splitting criterion and 3-fold
# cross-validation.
tree <- rpart(
  MYDEPV ~ Price + Income + Age,
  data = train_df,
  method = "class",
  parms = list(split = "information"),
  control = rpart.control(xval = 3)
)
printcp(tree)

# extra = 106 combines two display options:
#   6   - class models: the probability of the second class only
#         (useful for binary responses);
#   100 - also display the percentage of observations in the node.
rpart.plot(tree, type = 1, extra = 106, fallen.leaves = TRUE)

# Training-set performance.
pred_class <- predict(tree, train_df, type = "class")
conf_mat <- table(Actual = train_df$MYDEPV, Predicted = pred_class)
print(conf_mat)
print(class_recall(conf_mat))
# Misclassification rate on the training data.
print(1 - sum(diag(conf_mat)) / sum(conf_mat))

# ROC curve and AUC on the training set, using the predicted probability of
# the second class.
pred_prob <- predict(tree, train_df, type = "prob")[, 2]
pred <- prediction(pred_prob, train_df$MYDEPV)
perf <- performance(pred, "tpr", "fpr")
plot(perf)
abline(a = 0, b = 1) # diagonal reference line
auc_perf <- performance(pred, measure = "auc")
print(auc_perf@y.values[[1]])

# Test-set performance of the same tree. The per-class rates are taken from
# the test confusion matrix, not the training one.
pred_test <- predict(tree, test_df, type = "class")
conf_mat_test <- table(Actual = test_df$MYDEPV, Predicted = pred_test)
print(conf_mat_test)
print(class_recall(conf_mat_test))

# Second tree with the Gini splitting criterion and default control settings.
tree_gini <- rpart(
  MYDEPV ~ Price + Income + Age,
  data = train_df,
  method = "class",
  parms = list(split = "gini")
)
printcp(tree_gini)
rpart.plot(tree_gini, type = 1, extra = 106, fallen.leaves = TRUE)

# Prune at the complexity parameter with the lowest cross-validated error.
# xerror comes from random cross-validation folds, so the selected cp can
# differ between runs unless a seed is set beforehand.
cp_table <- tree_gini$cptable
best_cp <- cp_table[which.min(cp_table[, "xerror"]), "CP"]
print(best_cp)

pruned_tree <- prune(tree_gini, cp = best_cp)
printcp(pruned_tree)
rpart.plot(pruned_tree)

# Test-set performance of the pruned tree.
pruned_pred <- predict(pruned_tree, test_df, type = "class")
pruned_conf_mat <- table(Actual = test_df$MYDEPV, Predicted = pruned_pred)
print(pruned_conf_mat)
print(class_recall(pruned_conf_mat))