lab/5/data science/r/.Rhistory

# ---- Session 1: scalar and vector assignment ----
a <- 3
v <- c(1,2,3,4,5)
# q() quits R; the commands that follow were typed in a later session.
q()
# ---- Session 2: re-create the objects and load the zip-income table ----
a <- 3
v<-c(1,2,3,4,5)
# The path is relative, so it is resolved against the current working directory.
# The first call only prints the table; the second call keeps it as `db`.
read.table("git/public/lab/ds/25-1/r2/zipIncome.txt")
db = read.table("git/public/lab/ds/25-1/r2/zipIncome.txt")
# ---- Factor experiments ----
# NOTE: the variable `levels` has the same name as the base function levels().
levels<-c(1,2,3)
ratings<-c("a", "b", "c")
# None of the values in `ratings` occur in `levels`, so every element of `f`
# becomes NA here.
f<-factor(ratings, levels)
print(f)
# R has no `object.method()` call syntax: `f.size` is looked up as a function
# name, and no such function is defined in this history. length(f) gives the size.
print(f.size())
print(f)
# levels.default() is called with no argument; presumably levels(f) was meant.
print(levels.default())
# Indexing checks: R vectors are 1-based. Index 0 selects nothing (an empty
# vector) and an index past the end (ratings[4]) gives NA.
print(levels[0])
print(levels[1])
print(ratings[1])
print(ratings[2])
print(ratings[0])
print(ratings[4])
print(ratings[3])
# The next two lines are syntax errors: a parenthesised comma list is not a
# vector. The third line is the corrected form using c().
levels<-("a", "b", "a", "c")
levels<-("a", "b", "a", "c")
levels<-c("a", "b", "a", "c")
# `levels` now contains "a" twice; current R versions reject duplicated factor
# levels, which is presumably why the two vectors are swapped below.
f<-factor(ratings, levels)
# Swap `levels` and `ratings` through a temporary, then drop the temporary.
tmp<-levels
levels<-ratings
ratings<-tmp
remove(tmp)
# After the swap: ratings = a, b, a, c and levels = a, b, c.
f<-factor(ratings, levels)
# Indexing a factor: f[0] is an empty factor, f[1]..f[4] are the four
# elements, f[5] is past the end and gives NA.
f[0]
f[1]
f[2]
f[3]
f[4]
f[5]
# ---- Constants and functions ----
# `pi` is a built-in constant. `e` is not defined anywhere in this history
# (Euler's number is exp(1)). Typing `exp` without parentheses prints the
# function object; exp() with no argument is an error.
pi
e
exp
exp()
# ---- Vector arithmetic ----
# 1:10 and 15:24 are already vectors; wrapping them in c() is redundant.
v <- c(1:10)
w <- c(15:24)
# `<-` and `=` are equivalent for top-level assignment.
x <- v
x = v
# Element-wise product of two length-10 vectors.
x <- v * w
# ---- Indexing the `db` data frame ----
# A single index (db[i]) selects columns and returns a data frame.
db[1:3]
db[1:2]
db[1]
# Two indices select [rows, columns]; leaving one side empty means "all".
db[1,]
db[1:2,]
db[2,]
db[1,]
# db[, 1] returns the first column as a plain vector.
db[,1]
# Column lookups by name. The column names of `db` are not visible in this
# history; `$` returns NULL for a name that does not exist.
# NOTE(review): the table was read with read.table() defaults - confirm whether
# the file has a header row before relying on any of these names.
db$state
db
db$V1
db$q
db[,1]
db[1,]
db$zip_prefixes
db$name
db["zip_prefixes"]
# ---- A one-column, one-row data frame ----
# NOTE: `df` is also the name of a function in the stats package.
df <- data.frame(name=c("a"))
# `fd` is a typo for `df`.
fd
df
df$name
df[]
df[1]
df[,1]
# `df` has a single column, so asking for column 2 is an error.
df[,2]
df[1,]
class(df)
# `printd` is a typo, and `d` is not defined anywhere in this history.
printd
print(d)
# ---- Summary statistics and plots for the numeric vector `x` ----
summary(x)
var(x)
sd(x)
range(x)
plot(x)
hist(x)
str(x)
help(plot)
# ---- Logistic regression of MYDEPV on income, age and price indicators ----
# Work from the lab directory and load the survey data.
setwd("/home/sek1ro/git/public/lab/ds/25-1/r")
survey <- read.csv("survey.csv")
head(survey)

# 0/1 indicator columns for the price points 20 and 30.
survey$price20 <- as.numeric(survey$Price == 20)
survey$price30 <- as.numeric(survey$Price == 30)
head(survey)

# Constant column of ones (not used by the model below).
survey$one <- 1

# Arguments are positional: formula, family, data.
model <- glm(
  MYDEPV ~ Income + Age + price20 + price30,
  binomial(link = "logit"),  # family
  survey                     # data
)
summary(model)

# Quartiles of the model residuals.
quantile(residuals(model))
# ---- Train/test split and information-gain classification tree ----
# The original history began this section with a predict() call on `tree` and
# `test_df` before either object had been created, and then loaded and split
# the data twice. Both are removed; the same prediction is made again after
# the tree is fitted.
setwd("/home/sek1ro/git/public/lab/ds/25-1/r")
survey <- read.csv("survey.csv")

# Split by row position: rows 1-600 for fitting, rows 601-750 held out.
train_df <- survey[1:600, ]
test_df <- survey[601:750, ]

library(rpart)

# Classification tree for MYDEPV using the information splitting criterion and
# 3-fold cross-validation. Trailing commas are removed so that no empty
# argument is passed to rpart() or rpart.control().
tree <- rpart(
  MYDEPV ~ Price + Income + Age,
  data = train_df,
  method = "class",
  parms = list(split = "information"),
  control = rpart.control(xval = 3)
)

# Complexity-parameter table of the fitted tree.
printcp(tree)
library(rpart.plot)

# Draw the information-gain tree.
#   extra = 106 combines two display options:
#     6   - class models: show the probability of the second class only
#           (useful for binary responses);
#     100 - also show the percentage of observations in the node.
# The trailing comma after the last argument is dropped so that no empty
# argument is passed to rpart.plot().
rpart.plot(
  tree,
  type = 1,
  extra = 106,
  fallen.leaves = TRUE
)
# ---- Training-set performance of the information-gain tree ----
# Predicted class labels for the rows the tree was fitted on.
pred_class <- predict(tree, newdata = train_df, type = "class")

# Confusion matrix: rows are actual classes, columns are predicted classes.
conf_mat <- table(Actual = train_df$MYDEPV, Predicted = pred_class)
conf_mat

# Per-class share of correct predictions (diagonal over row totals).
print(diag(conf_mat) / rowSums(conf_mat))

# Overall misclassification rate.
print(1 - sum(diag(conf_mat)) / sum(conf_mat))

# Second column of the class-probability matrix, used as the ROC score.
pred_prob <- predict(tree, newdata = train_df, type = "prob")[, 2]

library(ROCR)
pred <- prediction(pred_prob, train_df$MYDEPV)

# ROC curve (true-positive rate against false-positive rate) with the
# diagonal y = x drawn for reference.
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf)
abline(a = 0, b = 1)

# Area under the ROC curve.
auc_perf <- performance(pred, measure = "auc")
slot(auc_perf, "y.values")[[1]]
# ---- Held-out performance of the information-gain tree ----
pred_test <- predict(tree, test_df, type = "class")

# Confusion matrix on the held-out rows: actual classes by predicted classes.
conf_mat_test <- table(Actual = test_df$MYDEPV, Predicted = pred_test)
conf_mat_test

# Per-class share of correct predictions on the test set. The original line
# used the training matrix `conf_mat` here, so it re-printed training figures.
print(diag(conf_mat_test) / rowSums(conf_mat_test))
# ---- Classification tree using the Gini splitting criterion ----
# Same formula and training data as the information-gain tree; no explicit
# control argument is given, so rpart's defaults apply.
tree_gini <- rpart(
  MYDEPV ~ Price + Income + Age,
  data = train_df,
  method = "class",
  parms = list(split = "gini")
)

# Complexity-parameter table of the Gini tree.
printcp(tree_gini)

# Plot with the same display options as the information-gain tree. The
# trailing comma is dropped so that no empty argument is passed.
rpart.plot(
  tree_gini,
  type = 1,
  extra = 106,
  fallen.leaves = TRUE
)
# ---- Prune the Gini tree at the CP with the lowest cross-validated error ----
# The original history first referenced a misspelt `tree_dini` (and mixed in
# the cptable of `tree`), then re-ran the corrected statements several times.
# Only the corrected sequence is kept, once.

# CP value of the cptable row whose "xerror" column is smallest.
best_cp <- tree_gini$cptable[which.min(tree_gini$cptable[, "xerror"]), "CP"]
best_cp

pruned_tree <- prune(tree_gini, cp = best_cp)
printcp(pruned_tree)
rpart.plot(pruned_tree)
# ---- Held-out performance of the information-gain tree ----
pred_test <- predict(tree, newdata = test_df, type = "class")

# Actual classes (rows) against predicted classes (columns) on the test rows.
conf_mat_test <- table(
  Actual = test_df$MYDEPV,
  Predicted = pred_test
)
conf_mat_test

# Share of each actual class that was predicted correctly.
print(diag(conf_mat_test) / rowSums(conf_mat_test))
# ---- Gini tree: fit, inspect, prune ----
# Classification tree for MYDEPV using the Gini splitting criterion.
tree_gini <- rpart(
  MYDEPV ~ Price + Income + Age,
  data = train_df,
  method = "class",
  parms = list(split = "gini")
)
printcp(tree_gini)

# Plot the unpruned tree. The trailing comma after the last argument is
# removed so that no empty argument is passed to rpart.plot().
rpart.plot(
  tree_gini,
  type = 1,
  extra = 106,
  fallen.leaves = TRUE
)

# Pick the CP of the cptable row with the smallest cross-validated error
# ("xerror") and prune the tree at that value.
best_cp <- tree_gini$cptable[which.min(tree_gini$cptable[, "xerror"]), "CP"]
best_cp
pruned_tree <- prune(tree_gini, cp = best_cp)
printcp(pruned_tree)
rpart.plot(pruned_tree)
# ---- Held-out performance of the pruned Gini tree ----
pruned_pred <- predict(pruned_tree, newdata = test_df, type = "class")

# Actual classes (rows) against predicted classes (columns) on the test rows.
pruned_conf_mat <- table(
  Actual = test_df$MYDEPV,
  Predicted = pruned_pred
)
pruned_conf_mat

# Share of each actual class that was predicted correctly.
print(diag(pruned_conf_mat) / rowSums(pruned_conf_mat))
# ---- Compare the information-gain tree with the pruned Gini tree ----

# 1. Baseline: information-gain tree evaluated on the held-out rows.
pred_test <- predict(tree, test_df, type = "class")
conf_mat_test <- table(Actual = test_df$MYDEPV, Predicted = pred_test)
conf_mat_test
# Per-class share of correct predictions.
print(diag(conf_mat_test) / rowSums(conf_mat_test))

# 2. Fit a tree with the Gini splitting criterion on the training rows.
tree_gini <- rpart(
  MYDEPV ~ Price + Income + Age,
  data = train_df,
  method = "class",
  parms = list(split = "gini")
)
printcp(tree_gini)

# The trailing comma after the last argument is removed so that no empty
# argument is passed to rpart.plot().
rpart.plot(
  tree_gini,
  type = 1,
  extra = 106,
  fallen.leaves = TRUE
)

# 3. Prune at the CP whose cross-validated error ("xerror") is smallest.
best_cp <- tree_gini$cptable[which.min(tree_gini$cptable[, "xerror"]), "CP"]
best_cp
pruned_tree <- prune(tree_gini, cp = best_cp)
printcp(pruned_tree)
rpart.plot(pruned_tree)

# 4. Pruned Gini tree evaluated on the same held-out rows.
pruned_pred <- predict(pruned_tree, test_df, type = "class")
pruned_conf_mat <- table(Actual = test_df$MYDEPV, Predicted = pruned_pred)
pruned_conf_mat
print(diag(pruned_conf_mat) / rowSums(pruned_conf_mat))