# Documente Academic / Documente Profesional / Documente Cultură
# (site-navigation text captured by the PDF-to-text extraction; not part
# of the analysis)
## V1 V2 V3 V4 V5 V6 V7
## 1 vhigh vhigh 2 2 small low unacc
## 2 vhigh vhigh 2 2 small med unacc
## 3 vhigh vhigh 2 2 small high unacc
## 4 vhigh vhigh 2 2 med low unacc
## 5 vhigh vhigh 2 2 med med unacc
## 6 vhigh vhigh 2 2 med high unacc
# Data Slicing ----
# Reproducible stratified 70/30 train/test split on the class label V7.
# createDataPartition() samples within each level of V7 so the class
# distribution is preserved in both subsets.
set.seed(3033)
intrain <- createDataPartition(y = car_df$V7, p = 0.7, list = FALSE)
training <- car_df[intrain, ]
testing <- car_df[-intrain, ]
# (page 1 — page number left over from PDF extraction)
# check dimensions of train & test set
# (the p = 0.7 stratified split: 1211 + 517 = 1728 rows in total)
dim(training)
## [1] 1211 7
dim(testing)
## [1] 517 7
# NOTE(review): the FALSE below is stray console output — presumably from a
# missing-value check such as anyNA(car_df) whose call was lost in
# extraction; confirm against the original document.
## [1] FALSE
# Per-column summary: V1-V6 are balanced factor predictors; V7 is the
# 4-level class label, dominated by 'unacc'.
summary(car_df)
## V1 V2 V3 V4 V5 V6
## high :432 high :432 2 :432 2 :576 big :576 high:576
## low :432 low :432 3 :432 4 :576 med :576 low :576
## med :432 med :432 4 :432 more:576 small:576 med :576
## vhigh:432 vhigh:432 5more:432
## V7
## acc : 384
## good : 69
## unacc:1210
## vgood: 65
## CART
##
## 1211 samples
## 6 predictor
## 4 classes: 'acc', 'good', 'unacc', 'vgood'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
# (page 2 — page number left over from PDF extraction)
## Summary of sample sizes: 1091, 1090, 1091, 1089, 1089, 1089, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.01098901 0.8580406 0.6994059
## 0.01373626 0.8249997 0.6220574
## 0.01510989 0.8159311 0.6034018
## 0.01556777 0.8084860 0.5840711
## 0.01648352 0.8065620 0.5775304
## 0.01831502 0.7979943 0.5577698
## 0.02060440 0.7941443 0.5489378
## 0.02197802 0.7922137 0.5480683
## 0.06868132 0.7883246 0.5726618
## 0.09340659 0.7233582 0.2223118
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01098901.
## Text rendering of the rpart.plot figure for the fitted decision tree
## (node split labels and leaf classes; the 2-D layout was flattened by
## PDF-to-text extraction, so line positions below no longer convey the
## tree structure):
## yes V6low = 0 no
## unacc
## V44 = 1
## V4more = 1
## V2vhigh = 0
## V1vhigh = 0 unacc
## V1low = 0 V5small = 0
## V6med = 1 unacc unacc
## V1med = 0 V1med = 1 V1vhigh = 0
## V2low = 0 V5small = 1 V1low = 1 V2med = 1
## V6med = 0 acc acc V6med = 1
## V5small = 0 good vgood unacc V2vhigh = 1 V2low = 1
## acc acc good acc V2low = 0 acc
## unacc good V1low = 0 unacc
## acc acc acc acc
## vgood
## V1med = 0
## vgood
## acc
# Prediction
# Inspect the first hold-out row before scoring it.
testing[1,]
## V1 V2 V3 V4 V5 V6 V7
# (the stray "3" below is a page number left over from PDF extraction)
3
## 6 vhigh vhigh 2 2 med high unacc
# Score a single row with the fitted CART model. dtree_fit is trained
# elsewhere in the document — presumably via caret::train with
# method = "rpart"; confirm against the full text.
predict(dtree_fit, newdata = testing[1,])
## [1] unacc
## Levels: acc good unacc vgood
# Score the whole hold-out set and cross-tabulate predictions against the
# true labels to obtain accuracy, kappa and per-class statistics.
test_pred <- predict(dtree_fit, newdata = testing)
confusionMatrix(test_pred, testing$V7 ) #check accuracy
## CART
##
## 1211 samples
## 6 predictor
## 4 classes: 'acc', 'good', 'unacc', 'vgood'
# (page 4 — page number left over from PDF extraction)
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 1091, 1090, 1091, 1089, 1089, 1089, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.01098901 0.8522395 0.6816530
## 0.01373626 0.8362745 0.6436379
## 0.01510989 0.8305029 0.6305745
## 0.01556777 0.8249840 0.6168644
## 0.01648352 0.8227709 0.6115286
## 0.01831502 0.8180553 0.6039963
## 0.02060440 0.8095423 0.5858712
## 0.02197802 0.8032220 0.5725628
## 0.06868132 0.7888755 0.5727260
## 0.09340659 0.7233582 0.2223118
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01098901.
## Text rendering of the rpart.plot figure for the second tree —
## presumably the Gini-criterion model scored below as dtree_fit_gini
## (node split labels and leaf classes; the 2-D layout was flattened by
## PDF-to-text extraction, so line positions no longer convey structure):
## yes V6low = 0 no
## unacc
## V44 = 1
## V4more = 1
## V2vhigh = 0
## V1vhigh = 0 unacc
## V1low = 0 V5small = 0
## V6med = 1 unacc unacc
## V6med = 0 V1med = 1 V1vhigh = 0
## V5small = 0 V5small = 1 V1low = 1 V2med = 1
## acc acc acc V2low = 0
## unacc vgood unacc V6med = 1 V2low = 1
## acc good acc acc acc
## vgood unacc
## good acc
# Prediction
# (page 5 — page number left over from PDF extraction)
# Score the hold-out set with the Gini-split tree (dtree_fit_gini, trained
# earlier in the document) and cross-tabulate predictions against the true
# labels to obtain accuracy and per-class statistics.
test_pred_gini <- predict(dtree_fit_gini, newdata = testing)
confusionMatrix(test_pred_gini, testing$V7) # check accuracy