
DATA CLASSIFICATION USING SUPERVISED

LEARNING METHOD (DATA MINING)

By :
Fadhlurrohman Henriwan
4SK1 (09)
NIM : 16.9112

POLITEKNIK STATISTIKA STIS


JAKARTA 2019
Data Mining Classification
Fadhlurrohman Henriwan, 4SK1

Description

The CIA Factbook has geographic, demographic, and economic data on a country-by-country
basis. In the description of the variables, the 4-digit number indicates the code used to specify that
variable on the data and documentation web site. For instance,
https://www.cia.gov/library/publications/the-world-factbook/fields/2153.html contains
documentation for variable code 2153, network users.

Data
The analysis focuses on the following variables (the 4-digit Factbook code follows each description):
Fert (dependent variable): Children born/woman (#/person), 2127
Pop: Number of people, 2119
Birth: Birth rate (#/1000), 2054
Death: Death rate (#/1000), 2066
Infant: Infant deaths per 1000 live births, 2091
Life: Life expectancy (years), 2102
Labor: Labor force (people), 2095
Tax: Taxes and other revenues (% of GDP), 2221
Imports: Imports ($), 2087
Gold: Reserves of foreign exchange and gold ($), 2188
Mainlines: Telephones, main lines in use, 2150
library(class)

## Warning: package 'class' was built under R version 3.5.3

library(nnet)

## Warning: package 'nnet' was built under R version 3.5.3

library(caret)

## Warning: package 'caret' was built under R version 3.5.3

## Loading required package: lattice

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 3.5.3

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.5.3

##
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':


##
## filter, lag

## The following objects are masked from 'package:base':


##
## intersect, setdiff, setequal, union

library(ggplot2)
library(ggpubr)

## Warning: package 'ggpubr' was built under R version 3.5.3

## Loading required package: magrittr

## Warning: package 'magrittr' was built under R version 3.5.3

library(psycho)

## Warning: package 'psycho' was built under R version 3.5.3

## message: psycho's `analyze()` is deprecated in favour of the report package. Check it out at https://github.com/easystats/report

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 3.5.3

## -- Attaching packages -------------------------------------------- tidyverse 1.2.1 --
## v tibble 2.0.1 v purrr 0.3.1
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0

## Warning: package 'tibble' was built under R version 3.5.3

## Warning: package 'tidyr' was built under R version 3.5.3

## Warning: package 'readr' was built under R version 3.5.3

## Warning: package 'purrr' was built under R version 3.5.3

## Warning: package 'stringr' was built under R version 3.5.3

## Warning: package 'forcats' was built under R version 3.5.3

## -- Conflicts ----------------------------------------------- tidyverse_conflicts() --
## x tidyr::extract() masks magrittr::extract()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
## x purrr::set_names() masks magrittr::set_names()

library(MASS)

## Warning: package 'MASS' was built under R version 3.5.3

##
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':


##
## select

library(plotrix)

## Warning: package 'plotrix' was built under R version 3.5.3

library(rcompanion)

## Warning: package 'rcompanion' was built under R version 3.5.3

library(pROC)

## Warning: package 'pROC' was built under R version 3.5.3

## Type 'citation("pROC")' for a citation.

##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var

# read the CIA Factbook extract and impute missing values with the column mean
country <- read.csv("E:/PS-STIS/TINGKAT 4/Semester 7/Data Mining/Pertemuan 7/CountryData.csv")
country$fert = ifelse(is.na(country$fert), ave(country$fert, FUN = function(x) mean(x, na.rm = TRUE)), country$fert)
country$pop = ifelse(is.na(country$pop), ave(country$pop, FUN = function(x) mean(x, na.rm = TRUE)), country$pop)
country$birth = ifelse(is.na(country$birth), ave(country$birth, FUN = function(x) mean(x, na.rm = TRUE)), country$birth)
country$death = ifelse(is.na(country$death), ave(country$death, FUN = function(x) mean(x, na.rm = TRUE)), country$death)
country$infant = ifelse(is.na(country$infant), ave(country$infant, FUN = function(x) mean(x, na.rm = TRUE)), country$infant)
country$life = ifelse(is.na(country$life), ave(country$life, FUN = function(x) mean(x, na.rm = TRUE)), country$life)
country$labor = ifelse(is.na(country$labor), ave(country$labor, FUN = function(x) mean(x, na.rm = TRUE)), country$labor)
country$tax = ifelse(is.na(country$tax), ave(country$tax, FUN = function(x) mean(x, na.rm = TRUE)), country$tax)
country$gold = ifelse(is.na(country$gold), ave(country$gold, FUN = function(x) mean(x, na.rm = TRUE)), country$gold)
country$imports = ifelse(is.na(country$imports), ave(country$imports, FUN = function(x) mean(x, na.rm = TRUE)), country$imports)
country$mainlines = ifelse(is.na(country$mainlines), ave(country$mainlines, FUN = function(x) mean(x, na.rm = TRUE)), country$mainlines)
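The eleven imputation statements above can also be written more compactly; a minimal equivalent sketch that loops over the same column names:

num_cols <- c("fert", "pop", "birth", "death", "infant", "life",
              "labor", "tax", "gold", "imports", "mainlines")
for (v in num_cols) {
  # replace missing values with the column mean, as in the statements above
  country[[v]][is.na(country[[v]])] <- mean(country[[v]], na.rm = TRUE)
}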

summary(country$fert)

## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.800 1.770 2.260 2.582 2.860 6.890

# dichotomize fertility at its mean (2.582) and keep the predictors plus the new class label
country$grpfert <- cut(country$fert, breaks = c(-Inf, 2.582, Inf), labels = c("Under Mean Birth", "Over Mean Birth"))
country2 <- country[, c("pop", "birth", "death", "infant", "life", "labor", "tax", "gold", "imports", "mainlines", "grpfert")]
country2$grpfert = factor(country2$grpfert, levels = c("Under Mean Birth", "Over Mean Birth"))

# random 75/25 train/test split
trainingRows = sample(1:nrow(country2), 0.75 * nrow(country2))
trainingData = country2[trainingRows, ]
testData = country2[-trainingRows, ]
summary(trainingData)

## pop birth death infant
## Min. :4.800e+01 Min. : 6.72 Min. : 1.530 Min. : 1.81
## 1st Qu.:3.207e+05 1st Qu.:11.84 1st Qu.: 6.325 1st Qu.: 6.91
## Median :5.160e+06 Median :18.44 Median : 7.907 Median : 16.29
## Mean :2.512e+07 Mean :19.62 Mean : 8.087 Mean : 24.76
## 3rd Qu.:2.315e+07 3rd Qu.:24.52 3rd Qu.: 9.495 3rd Qu.: 37.62
## Max. :1.236e+09 Max. :46.12 Max. :17.490 Max. :117.23
## life labor tax gold
## Min. :49.44 Min. : 15 Min. : 3.70 Min. :3.208e+07
## 1st Qu.:67.70 1st Qu.: 195875 1st Qu.:21.07 1st Qu.:3.344e+09
## Median :72.97 Median : 2362000 Median :29.19 Median :4.879e+10
## Mean :71.55 Mean : 11261475 Mean :29.15 Mean :6.276e+10
## 3rd Qu.:78.03 3rd Qu.: 11250000 3rd Qu.:35.00 3rd Qu.:8.178e+10
## Max. :89.57 Max. :487300000 Max. :79.60 Max. :1.268e+12
## imports mainlines grpfert
## Min. :9.038e+06 Min. : 1900 Under Mean Birth:137
## 1st Qu.:2.286e+09 1st Qu.: 58750 Over Mean Birth : 55
## Median :1.049e+10 Median : 495000
## Mean :8.059e+10 Mean : 4518217
## 3rd Qu.:9.309e+10 3rd Qu.: 4599500
## Max. :2.273e+12 Max. :139000000

head(trainingData)

## pop birth death infant life labor tax
## 238 35918915 44.17000 10.970000 60.82000 54.46000 17400000 14.20000
## 148 3516806 31.83000 8.350000 56.06000 62.28000 1318000 40.10000
## 88 4935880 12.93000 10.770000 16.68000 75.72000 1959000 30.30000
## 6 85458 8.48000 6.820000 3.69000 82.65000 36060 21.40000
## 184 10813834 9.42000 10.970000 4.48000 79.01000 5395000 43.50000
## 194 31530 19.65906 7.906978 24.48442 71.79928 17300 29.19209
## gold imports mainlines grpfert
## 238 3579000000 4858000000 315000 Over Mean Birth
## 148 81784312663 3355000000 65100 Over Mean Birth
## 88 3317000000 7064000000 1276000 Under Mean Birth
## 6 81784312663 1430000000 39000 Under Mean Birth
## 184 22660000000 59000000000 4558000 Under Mean Birth
## 194 81784312663 93093954222 5367557 Under Mean Birth

1. Linear Discriminant Analysis


model_lda <- lda(grpfert ~ pop+birth+death+infant+life+labor+tax+gold+imports
+mainlines, data=trainingData)
model_lda

## Call:
## lda(grpfert ~ pop + birth + death + infant + life + labor + tax +
## gold + imports + mainlines, data = trainingData)
##
## Prior probabilities of groups:
## Under Mean Birth Over Mean Birth
## 0.7135417 0.2864583
##
## Group means:
## pop birth death infant life labor
## Under Mean Birth 28599924 14.62102 7.814522 14.23520 75.33274 13279914
## Over Mean Birth 16454099 32.08745 8.767273 50.97491 62.11818 6233726
## tax gold imports mainlines
## Under Mean Birth 30.65330 75269965590 107841140590 6141750.9
## Over Mean Birth 25.39019 31601898229 12699399321 474140.3
##
## Coefficients of linear discriminants:
## LD1
## pop 1.701532e-08
## birth 2.515883e-01
## death 4.303350e-02
## infant -1.651347e-02
## life 1.638464e-02
## labor -4.636950e-08
## tax 3.055525e-03
## gold 6.476736e-13
## imports 1.393661e-12
## mainlines -1.414573e-08

plot(model_lda)

Prior Probability of Groups

Under Mean Birth Over Mean Birth


0.7135417 0.2864583

LDA determines the group means and computes, for each individual, the probability of belonging to each group. The individual is then assigned to the group with the highest probability. The prior probability of groups shows the proportion of training observations in each group: in this case 71% of the countries in the training set are in the Under Mean Birth group, while the rest are in the Over Mean Birth group.
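For illustration, the posterior probabilities that this assignment rule is based on can be extracted from the fitted model with predict(); a minimal sketch (MASS::predict.lda returns them in the $posterior component):

post_lda <- predict(model_lda, testData[, -11])$posterior  # one row per test country
head(post_lda)
# the assigned class is the one with the larger posterior probability
head(colnames(post_lda)[apply(post_lda, 1, which.max)])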

Group Means

Variable Under Mean Birth Over Mean Birth


Pop 28599924 16454099
Birth 14.62102 32.08745
Death 7.814522 8.767273
Infant 14.23520 50.97491
Life 75.33274 62.11818
Labor 13279914 6233726
Tax 30.65330 25.39019
Gold 75269965590 31601898229
Imports 107841140590 12699399321
Mainlines 6141750.9 474140.3

The group means table shows the mean of each variable within each group. For instance, the mean number of infant deaths is 14 per 1000 live births in the Under Mean Birth group, compared with 51 per 1000 live births in the Over Mean Birth group.

The coefficients of linear discriminants give the linear combination of predictor variables used to form the LDA decision rule. In this case, the linear combination obtained is

LD1 = 1.701532e-08 (pop) + 2.515883e-01 (birth) + 4.303350e-02 (death) - 1.651347e-02 (infant) + 1.638464e-02 (life) - 4.636950e-08 (labor) + 3.055525e-03 (tax) + 6.476736e-13 (gold) + 1.393661e-12 (imports) - 1.414573e-08 (mainlines)
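The LD1 scores implied by this combination are returned by predict() in the $x component; a short sketch:

ld1_scores <- predict(model_lda, testData[, -11])$x  # discriminant score per test country
head(ld1_scores)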

#CONFUSION MATRIX
predict_lda <- predict(model_lda, testData[,-11])
predict_lda

CM_lda <- table(testData$grpfert, predict_lda$class)


CM_lda
##
## Under Mean Birth Over Mean Birth
## Under Mean Birth 43 1
## Over Mean Birth 8 12

Confusion Matrix of LDA

Birth Category Under Mean (Prediction) Over Mean (Prediction)


Under Mean 43 1
Over Mean 8 12

The confusion matrix cross-tabulates the observed and predicted classes. For the Under Mean group, 43 out of 44 countries were classified correctly, while in the Over Mean group 12 out of 20 were classified correctly.

#percent correct for each category of grpfert


diag(prop.table(CM_lda,1))

## Under Mean Birth Over Mean Birth
## 0.9772727 0.6000000

#total percent correct


sum(diag(prop.table(CM_lda)))

## [1] 0.859375

accuracy_lda <- (sum(diag(CM_lda)))/sum(CM_lda)


accuracy_lda

## [1] 0.859375

sensitivity(CM_lda)

## [1] 0.8431373

specificity(CM_lda)

## [1] 0.9230769

Percent Correct within each Observed Category

Under Mean Birth Over Mean Birth Accuracy
0.9772727 0.6000000 0.859375
# ROC curve
lda.roc <- roc(testData$grpfert, ordered(predict_lda$class), levels = c("Under Mean Birth", "Over Mean Birth"), direction = "<")
plot.roc(lda.roc, print.auc = T, main = "LDA ROC Curve")

2. Logistic Regression

library(ISLR)

## Warning: package 'ISLR' was built under R version 3.5.3

library(corrplot)

## Warning: package 'corrplot' was built under R version 3.5.3

## corrplot 0.84 loaded

library(caret)
library(pROC)

correlations <- cor(country2[,1:10])


corrplot(correlations, method="circle")
The correlation matrix is shown in dot form: blue dots indicate positive correlation, red dots negative correlation, and larger dots indicate stronger correlation.

The classifier below is fitted with caret's "LogitBoost" method, i.e. boosted logistic regression.
or.mod = train(grpfert ~ ., data = trainingData, method = "LogitBoost")
or.mod

## Boosted Logistic Regression


##
## 192 samples
## 10 predictor
## 2 classes: 'Under Mean Birth', 'Over Mean Birth'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 192, 192, 192, 192, 192, 192, ...
## Resampling results across tuning parameters:
##
## nIter Accuracy Kappa
## 11 0.9770124 0.9447095
## 21 0.9760514 0.9424233
## 31 0.9725441 0.9334644
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was nIter = 11.
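For comparison, an ordinary (non-boosted) logistic regression could be fitted with base R's glm(). This is a hedged sketch, not part of the original run; glm may also warn about separation because birth splits the two classes almost perfectly:

glm_mod <- glm(grpfert ~ ., data = trainingData, family = binomial)
p_hat <- predict(glm_mod, newdata = testData, type = "response")  # P(Over Mean Birth)
pred_glm <- ifelse(p_hat > 0.5, "Over Mean Birth", "Under Mean Birth")
table(testData$grpfert, pred_glm)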
# prediction
pr2_or = predict(or.mod, testData[,-11])
pr2_or

#CM
tab2<-table(testData$grpfert,pr2_or)
tab2

## pr2_or
## Under Mean Birth Over Mean Birth
## Under Mean Birth 44 0
## Over Mean Birth 2 18

cm2=confusionMatrix(tab2)
cm2

## Confusion Matrix and Statistics


##
## pr2_or
## Under Mean Birth Over Mean Birth
## Under Mean Birth 44 0
## Over Mean Birth 2 18
##
## Accuracy : 0.9688
## 95% CI : (0.8916, 0.9962)
## No Information Rate : 0.7188
## P-Value [Acc > NIR] : 2.217e-07
##
## Kappa : 0.9252
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 0.9565
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9000
## Prevalence : 0.7188
## Detection Rate : 0.6875
## Detection Prevalence : 0.6875
## Balanced Accuracy : 0.9783
##
## 'Positive' Class : Under Mean Birth
##

accuracy_lr <- (sum(diag(tab2)))/sum(tab2)


accuracy_lr

## [1] 0.96875
Confusion Matrix of Logistic Regression

Birth Category Under Mean (Prediction) Over Mean (Prediction)


Under Mean 44 0
Over Mean 2 18

The confusion matrix cross-tabulates the observed and predicted classes. For the Under Mean group, 44 out of 44 countries were classified correctly, while in the Over Mean group 18 out of 20 were classified correctly.
Sensitivity, Specificity, and Accuracy

Sensitivity Specificity Accuracy
0.95652 1.00000 0.96875

# ROC curve
logreg.roc <- roc(testData$grpfert, ordered(pr2_or), levels = c("Under Mean Birth", "Over Mean Birth"), direction = "<")
plot.roc(logreg.roc, print.auc = T, main = "Logistic Regression ROC Curve")

3. K-Nearest Neighbour Function


library(kknn)

## Warning: package 'kknn' was built under R version 3.5.3

##
## Attaching package: 'kknn'

## The following object is masked from 'package:caret':


##
## contr.dummy

model_knn <- train.kknn(grpfert ~ ., data = trainingData, kmax=30)


model_knn

##
## Call:
## train.kknn(formula = grpfert ~ ., data = trainingData, kmax = 30)
##
## Type of response variable: nominal
## Minimal misclassification: 0.05208333
## Best kernel: optimal
## Best k: 5

summary(model_knn)

##
## Call:
## train.kknn(formula = grpfert ~ ., data = trainingData, kmax = 30)
##
## Type of response variable: nominal
## Minimal misclassification: 0.05208333
## Best kernel: optimal
## Best k: 5

model_knn$MISCLASS

## optimal
## 1 0.05729167
## 2 0.05729167
## 3 0.05729167
## 4 0.05729167
## 5 0.05208333
## 6 0.05208333
## 7 0.05208333
## 8 0.05208333
## 9 0.05208333
## 10 0.05729167
## 11 0.05729167
## 12 0.05729167
## 13 0.05729167
## 14 0.05729167
## 15 0.05729167
## 16 0.05729167
## 17 0.05729167
## 18 0.05729167
## 19 0.07291667
## 20 0.07291667
## 21 0.07812500
## 22 0.07812500
## 23 0.07812500
## 24 0.07812500
## 25 0.07812500
## 26 0.07812500
## 27 0.07812500
## 28 0.07812500
## 29 0.07812500
## 30 0.07291667

The k-nearest neighbour method was applied after splitting the data into two groups: a training set containing 75% of the observations and a test set containing the remaining 25%.

Output:

Type of response variable: nominal


Minimal misclassification: 0.05208333
Best kernel: optimal
Best k: 5
Misclassification Details

K Misclassification
1 0.05729167
2 0.05729167
3 0.05729167
4 0.05729167
5 0.05208333
6 0.05208333
7 0.05208333
8 0.05208333

In this run, the maximum number of neighbours considered was 30, and the optimal number of nearest neighbours obtained was 5. A direct fit with this k is sketched below.
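A minimal sketch of such a direct fit with kknn() (an illustration of usage, not part of the original run):

fit_k5 <- kknn(grpfert ~ ., train = trainingData, test = testData,
               k = 5, kernel = "optimal")
table(testData$grpfert, fitted(fit_k5))  # test-set confusion matrix for k = 5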

predictionknn <- predict(model_knn, testData[,-11])


predictionknn

CMknn <- table(testData$grpfert, predictionknn)


CMknn
## predictionknn
## Under Mean Birth Over Mean Birth
## Under Mean Birth 43 1
## Over Mean Birth 4 16

accuracy_knn <- (sum(diag(CMknn)))/sum(CMknn)


accuracy_knn

## [1] 0.921875

sensitivity(CMknn)

## [1] 0.9148936

specificity(CMknn)

## [1] 0.9411765

Confusion Matrix of K-Nearest Neighbour

Birth Category Under Mean (Prediction) Over Mean (Prediction)


Under Mean 43 1
Over Mean 4 16

The confusion matrix cross-tabulates the observed and predicted classes. For the Under Mean group, 43 out of 44 countries were classified correctly, while in the Over Mean group 16 out of 20 were classified correctly.

knn.roc <- roc(testData$grpfert, ordered(predictionknn), levels = c("Under Mean Birth", "Over Mean Birth"), direction = "<")
plot.roc(knn.roc, print.auc = T, main = "KNN ROC Curve")
4. Decision Tree
library(party)

## Warning: package 'party' was built under R version 3.5.3

## Loading required package: grid

## Loading required package: mvtnorm

## Warning: package 'mvtnorm' was built under R version 3.5.2

## Loading required package: modeltools

## Warning: package 'modeltools' was built under R version 3.5.2

## Loading required package: stats4

## Loading required package: strucchange

## Warning: package 'strucchange' was built under R version 3.5.3

## Loading required package: zoo

## Warning: package 'zoo' was built under R version 3.5.3


##
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':


##
## as.Date, as.Date.numeric

## Loading required package: sandwich

## Warning: package 'sandwich' was built under R version 3.5.3

##
## Attaching package: 'strucchange'

## The following object is masked from 'package:stringr':


##
## boundary

output.tree <- ctree(grpfert ~ ., data= trainingData)


output.tree

##
## Conditional inference tree with 2 terminal nodes
##
## Response: grpfert
## Inputs: pop, birth, death, infant, life, labor, tax, gold, imports, mainlines
## Number of observations: 192
##
## 1) birth <= 21.85; criterion = 1, statistic = 135.95
## 2)* weights = 137
## 1) birth > 21.85
## 3)* weights = 55

#Show built model


plot(output.tree)

The decision tree method was applied using the same approach as the k-nearest neighbour method: the data were split into a training set (75% of the observations) and a test set (the remaining 25%). The fitted tree reduces to a single split on the birth variable at 21.85; this rule is sketched below.
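Because the fitted tree contains a single split, the rule can be checked by hand; a sketch, assuming node 2 (reached when birth <= 21.85) corresponds to Under Mean Birth, as the node weights above suggest:

rule_pred <- ifelse(testData$birth <= 21.85, "Under Mean Birth", "Over Mean Birth")
table(testData$grpfert, rule_pred)  # should closely match the tree's predictions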
#PREDICTION
predictiondt <- predict(output.tree, testData[,-11])
CMtree <- table(testData$grpfert, predictiondt)
CMtree

## predictiondt
## Under Mean Birth Over Mean Birth
## Under Mean Birth 44 0
## Over Mean Birth 1 19

accuracy_CMtree <- (sum(diag(CMtree)))/sum(CMtree)


accuracy_CMtree

## [1] 0.984375

sensitivity(CMtree)

## [1] 0.9777778

specificity(CMtree)

## [1] 1
Confusion Matrix of Decision Tree

Birth Category Under Mean (Prediction) Over Mean (Prediction)


Under Mean 44 0
Over Mean 1 19

The confusion matrix cross-tabulates the observed and predicted classes. For the Under Mean group, 44 out of 44 countries were classified correctly, while in the Over Mean group 19 out of 20 were classified correctly.

DT.roc <- roc(testData$grpfert, ordered(predictiondt), levels = c("Under Mean Birth", "Over Mean Birth"), direction = "<")
plot.roc(DT.roc, print.auc = T, main = "Decision Tree ROC Curve")

5. Random Forest
library(randomForest)

## Warning: package 'randomForest' was built under R version 3.5.3

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.


##
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':


##
## combine

## The following object is masked from 'package:ggplot2':


##
## margin

# build the random forest model
output.forest <- randomForest(grpfert ~ ., data = trainingData)
output.forest

##
## Call:
## randomForest(formula = grpfert ~ ., data = trainingData)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 1.04%
## Confusion matrix:
## Under Mean Birth Over Mean Birth class.error
## Under Mean Birth 136 1 0.00729927
## Over Mean Birth 1 54 0.01818182

Classification with the random forest method was applied to the same split of the data: a training set containing 75% of the observations and a test set containing the remaining 25%. Key settings and results of the fitted forest:
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 3
OOB estimate of error rate: 1.04%
Confusion Matrix of Train Set

Birth Category Under Mean (Prediction) Over Mean (Prediction)


Under Mean 136 1
Over Mean 1 54
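As a supplementary check (not part of the original output), the predictors driving the forest can be inspected with randomForest's built-in importance measures; a short sketch:

importance(output.forest)                 # mean decrease in Gini per predictor
varImpPlot(output.forest, main = "Random Forest Variable Importance")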

# validate the model on the test set
prediksi_rf <- predict(output.forest, testData[,-11])

#confusion matrix
CM_rf <- table(testData$grpfert, prediksi_rf)
CM_rf
## prediksi_rf
## Under Mean Birth Over Mean Birth
## Under Mean Birth 44 0
## Over Mean Birth 1 19

Confusion Matrix of Test Set

Birth Category Under Mean (Prediction) Over Mean (Prediction)


Under Mean 44 0
Over Mean 1 19

#percent correct for each category of grpfert


diag(prop.table(CM_rf,1))

## Under Mean Birth Over Mean Birth
## 1.00 0.95

#total percent correct


sum(diag(prop.table(CM_rf)))

## [1] 0.984375

accuracy_rf <- (sum(diag(CM_rf)))/sum(CM_rf)


accuracy_rf

## [1] 0.984375

sensitivity(CM_rf)

## [1] 0.9777778

specificity(CM_rf)

## [1] 1

Percent Correct within each Observed Category

Under Mean Birth Over Mean Birth Accuracy
1.00 0.95 0.984375

RF.roc <- roc(testData$grpfert, ordered(prediksi_rf), levels = c("Under Mean Birth", "Over Mean Birth"), direction = "<")
plot.roc(RF.roc, print.auc = T, main = "Random Forest ROC Curve")
6. Naive Bayes
library(e1071)

## Warning: package 'e1071' was built under R version 3.5.3

library(naivebayes)

## Warning: package 'naivebayes' was built under R version 3.5.3

## naivebayes 0.9.6 loaded

model_nb<- naiveBayes(grpfert~., data=trainingData)


print(model_nb)

##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Under Mean Birth Over Mean Birth
## 0.7135417 0.2864583
##
## Conditional probabilities:
## pop
## Y [,1] [,2]
## Under Mean Birth 28599924 112638536
## Over Mean Birth 16454099 27478113
##
## birth
## Y [,1] [,2]
## Under Mean Birth 14.62102 4.376068
## Over Mean Birth 32.08745 6.445796
##
## death
## Y [,1] [,2]
## Under Mean Birth 7.814522 2.659023
## Over Mean Birth 8.767273 3.378159
##
## infant
## Y [,1] [,2]
## Under Mean Birth 14.23520 11.05347
## Over Mean Birth 50.97491 25.85683
##
## life
## Y [,1] [,2]
## Under Mean Birth 75.33274 5.619746
## Over Mean Birth 62.11818 7.861525
##
## labor
## Y [,1] [,2]
## Under Mean Birth 13279914 45815657
## Over Mean Birth 6233726 9543885
##
## tax
## Y [,1] [,2]
## Under Mean Birth 30.65330 11.84013
## Over Mean Birth 25.39019 10.97577
##
## gold
## Y [,1] [,2]
## Under Mean Birth 75269965590 127941891477
## Over Mean Birth 31601898229 42659520087
##
## imports
## Y [,1] [,2]
## Under Mean Birth 107841140590 249494276730
## Over Mean Birth 12699399321 23700059683
##
## mainlines
## Y [,1] [,2]
## Under Mean Birth 6141750.9 15276853
## Over Mean Birth 474140.3 1031074
The naïve Bayes method classifies observations using probabilities derived from Bayes' theorem. Its key assumptions are that the attributes are conditionally independent given the class and carry equal weight. As before, the data were split into a training set (75% of the observations) and a test set (the remaining 25%).
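For illustration, the posterior probability for a single test country can be reconstructed from the model's per-class Gaussian parameters, which is exactly the Bayes-theorem computation described above. A hedged sketch, assuming e1071's naiveBayes stores the class counts in $apriori and, for each numeric predictor, the class means and standard deviations in $tables:

x <- testData[1, 1:10]  # predictors of the first test country
post <- sapply(levels(trainingData$grpfert), function(cl) {
  # product of the per-variable normal densities for class cl ...
  dens <- mapply(function(v, tab) dnorm(v, mean = tab[cl, 1], sd = tab[cl, 2]),
                 as.numeric(x), model_nb$tables[names(x)])
  # ... multiplied by the class prior
  as.numeric(model_nb$apriori[cl] / sum(model_nb$apriori)) * prod(dens)
})
post / sum(post)                        # by-hand posterior probabilities
predict(model_nb, x, type = "raw")      # library's posterior, for comparison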
predict_nb<-predict(model_nb,testData[,-11])
predict_nb

CM_nb <- table(testData$grpfert, predict_nb)


CM_nb

## predict_nb
## Under Mean Birth Over Mean Birth
## Under Mean Birth 32 12
## Over Mean Birth 4 16

confusionMatrix(CM_nb)

## Confusion Matrix and Statistics


##
## predict_nb
## Under Mean Birth Over Mean Birth
## Under Mean Birth 32 12
## Over Mean Birth 4 16
##
## Accuracy : 0.75
## 95% CI : (0.626, 0.8498)
## No Information Rate : 0.5625
## P-Value [Acc > NIR] : 0.001485
##
## Kappa : 0.4754
##
## Mcnemar's Test P-Value : 0.080118
##
## Sensitivity : 0.8889
## Specificity : 0.5714
## Pos Pred Value : 0.7273
## Neg Pred Value : 0.8000
## Prevalence : 0.5625
## Detection Rate : 0.5000
## Detection Prevalence : 0.6875
## Balanced Accuracy : 0.7302
##
## 'Positive' Class : Under Mean Birth
##
Confusion Matrix of Naive Bayes

Birth Category Under Mean (Prediction) Over Mean (Prediction)


Under Mean 32 12
Over Mean 4 16

The confusion matrix cross-tabulates the observed and predicted classes: 32 out of 44 Under Mean countries and 16 out of 20 Over Mean countries were classified correctly, giving an accuracy of 75%.

NB.roc <- roc(testData$grpfert, ordered(predict_nb), levels = c("Under Mean Birth", "Over Mean Birth"), direction = "<")
plot.roc(NB.roc, print.auc = T, main = "Naive Bayes ROC Curve")

7. Support Vector Machine


model_SVM = svm(formula = grpfert ~ .,
data = trainingData,
type = 'C-classification',
kernel = 'linear')

model_SVM

##
## Call:
## svm(formula = grpfert ~ ., data = trainingData, type = "C-classification",
## kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 30

The support vector machine (SVM) method represents each item as a point in n-dimensional space and separates the classes with a hyperplane. Beyond linear separation, it can also perform non-linear classification through the "kernel trick". First a classifier (the hyperplane) is fitted, with the fert category used as the factor (class) variable.

Parameters:
SVM-Type: C-classification
SVM-Kernel: linear
cost: 1
Number of Support Vectors: 30

A linear kernel was used, resulting in 30 support vectors. This is the number of training points that lie close to the decision boundary or on the wrong side of it. A non-linear alternative is sketched below.
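This sketch uses e1071's tune.svm() with a radial kernel and a small grid over cost and gamma; it is an illustration, not part of the original analysis:

tune_out <- tune.svm(grpfert ~ ., data = trainingData, kernel = "radial",
                     cost = c(0.1, 1, 10), gamma = c(0.01, 0.1, 1))
summary(tune_out)                 # cross-validated error for each (cost, gamma) pair
best_svm <- tune_out$best.model   # model refitted with the best parameter combination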
# Predicting the Test set results
predict_SVM = predict(model_SVM, newdata = testData[-11])
predict_SVM

# Making the Confusion Matrix


cm_svm = table(testData[, 11], predict_SVM)
cm_svm

## predict_SVM
## Under Mean Birth Over Mean Birth
## Under Mean Birth 44 0
## Over Mean Birth 2 18

#percent correct for each category of grpfert


diag(prop.table(cm_svm,1))

## Under Mean Birth Over Mean Birth
## 1.0 0.9

#total percent correct


sum(diag(prop.table(cm_svm)))

## [1] 0.96875

sensitivity(cm_svm)

## [1] 0.9565217

specificity(cm_svm)
## [1] 1

Confusion Matrix of SVM

Birth Category Under Mean (Prediction) Over Mean (Prediction)


Under Mean 44 0
Over Mean 2 18

# ROC curve
svm.roc <- roc(testData$grpfert, ordered(predict_SVM), levels = c("Under Mean Birth", "Over Mean Birth"), direction = "<")
plot.roc(svm.roc, print.auc = T, main = "SVM ROC Curve")

8. Neural Networks
library(mlbench)

## Warning: package 'mlbench' was built under R version 3.5.3

library(nnet)

nnet.mod=train(grpfert~.,data = trainingData,method="nnet")
predict_nn<-predict(nnet.mod,testData[,-11])
predict_nn

A neural network works through interconnected information-processing units that transform inputs into outputs via activation functions. The architecture consists of an input layer, one or more hidden layers, and an output layer.
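The predictors here span very different scales (e.g. birth rates versus gold reserves), which can hamper nnet. A hedged sketch, not part of the original run, of the same caret call with centering and scaling added through the preProcess argument:

nnet_scaled <- train(grpfert ~ ., data = trainingData, method = "nnet",
                     preProcess = c("center", "scale"),  # standardize the predictors
                     trace = FALSE)                      # silence nnet's iteration log
predict(nnet_scaled, testData[, -11])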
CM_nn <- table(testData$grpfert, predict_nn)
CM_nn

## predict_nn
## Under Mean Birth Over Mean Birth
## Under Mean Birth 44 0
## Over Mean Birth 20 0

confusionMatrix(CM_nn)

## Confusion Matrix and Statistics


##
## predict_nn
## Under Mean Birth Over Mean Birth
## Under Mean Birth 44 0
## Over Mean Birth 20 0
##
## Accuracy : 0.6875
## 95% CI : (0.5594, 0.7976)
## No Information Rate : 1
## P-Value [Acc > NIR] : 1
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 2.152e-05
##
## Sensitivity : 0.6875
## Specificity : NA
## Pos Pred Value : NA
## Neg Pred Value : NA
## Prevalence : 1.0000
## Detection Rate : 0.6875
## Detection Prevalence : 0.6875
## Balanced Accuracy : NA
##
## 'Positive' Class : Under Mean Birth
##

Confusion Matrix of Neural Network

Birth Category Under Mean (Prediction) Over Mean (Prediction)

Under Mean 44 0
Over Mean 20 0

The confusion matrix shows that the model predicted every test country as Under Mean: all 44 Under Mean countries were classified correctly, but all 20 Over Mean countries were misclassified. The total accuracy of this model is therefore 68.75% (44 out of 64).
NN.roc <- roc(testData$grpfert, ordered(predict_nn), levels = c("Under Mean Birth", "Over Mean Birth"), direction = "<")
plot.roc(NN.roc, print.auc = T, main = "Neural Network ROC Curve")
Conclusion and Discussion

Supervised classification was applied to the data using the models listed below. Model properties such as sensitivity, specificity, accuracy, and AUC were collected from the output.
Comparison of Models

Model Sensitivity Specificity Accuracy AUC


Linear Discriminant 0.8431 0.9230 0.8593 0.7890
Logistic Regression 0.9565 1 0.9688 0.9500
k-Nearest Neighbor 0.9148 0.9411 0.9218 0.8890
Decision Tree 0.9777 1 0.9843 0.9750
Random Forest 0.9777 1 0.9843 0.9750
Naïve Bayes 0.8889 0.5714 0.7500 0.7640
SVM 0.9565 1 0.9687 0.9687
Neural Networks 0.6875 NA 0.6875 0.5000

Considering these properties, the best models for classifying the data are the Decision Tree and the Random Forest. Both were chosen because their area under the curve (AUC) of 0.9750 indicates a 97.50% ability to distinguish the two groups, and because they perform consistently well on sensitivity, specificity, and accuracy.
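For completeness, the AUC column could be assembled programmatically from the roc objects fitted above; a sketch (values depend on the particular random train/test split):

model_auc <- c(LDA = auc(lda.roc), LogReg = auc(logreg.roc), kNN = auc(knn.roc),
               Tree = auc(DT.roc), RF = auc(RF.roc), NB = auc(NB.roc),
               SVM = auc(svm.roc), NN = auc(NN.roc))
round(model_auc, 4)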
