# Sunteți pe pagina 1 din 12

# SVM for Binary-Class #

# SVM Analysis for Binary-Class

# Session setup ----

# Raise the console print limit so that large objects (for example the full
# data frame printed further down) are not truncated.
options(max.print = 1000000)

# Project folder on the original author's machine. Only switch to it when it
# actually exists: an unconditional setwd() stops the whole script with an
# error on any other machine. When the folder is missing, the current working
# directory is kept and the CSV read below is resolved relative to it.
project_dir <- "E:\\Prabu\\Capstone"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  message("Directory '", project_dir, "' not found; staying in ", getwd())
}

# Show the directory that relative paths will be resolved against.
getwd()

# Load the required libraries (they must already be installed)

library(DMwR) #needed for SMOTE

library(psych)

library(ggplot2)

library(corrplot)

library(caret)

library(lattice)

library(MASS)

library(devtools)

library(ggord)

library(klaR) # For partition plot

library(precrec) # For ROC curve plot

# Data exploration ----
# Read the capstone data set from the working directory; the first row of
# the CSV holds the column names.
Dataset <- read.csv(file = "New_CAPSTONE_DATA_v2.csv", header = TRUE)

# Keep the raw import untouched in Dataset and do all further work on a copy.
Df2 <- Dataset

Df2 # view the dataframe

# Column names, types and the first few values of each column.
str(object = Df2)

> str(Df2) #(remove hash if you want to view the structure of the dataframe)
'data.frame': 471 obs. of 24 variables:
$ Company : Factor w/ 8 levels "BP","Chevron",..: 1 1 1 1 1 1
1 1 1 1 ...
$ Period : Factor w/ 59 levels "30-06-05","30-06-06",..: 31
1 16 46 32 2 17 47 33 3 ...
$ Curr.Ratio : num 0.992 1.015 1.145 1.046 1.048 ...
$ Quick.ratio : num 0.73 0.737 0.883 0.771 0.786 ...
$ LT.Debt.Eq.Ratio : num 0.344 0.368 0.38 0.36 0.353 ...
$ Cash.LT.Debt.Ratio : num 0.0553 0.0462 0.0693 0.4492 0.3977 ...
$ PAT.Asset.Ratio : num 0.1338 0.1114 0.1206 0.0651 0.1086 ...
$ Asset.Turnover.Ratio : num 1.1 1.18 1.27 1.23 1.25 ...
$ Inventory.Turnover.Ratio: num 13.1 13.2 13.9 12.9 13.8 ...
$ Return.on.Equity : num 0.33 0.279 0.313 0.167 0.279 ...
$ Net.asset : num 79911 80097 82726 80450 80566 ...
$ Cash.Asset.Ratio : num 0.0077 0.00678 0.01018 0.06291 0.05466 ...
$ Cash.Turnover.Ratio : num 142.7 174.7 124.6 19.6 22.9 ...
$ LT.Debt.Turnover.Ratio : num 7.89 8.06 8.64 8.81 9.1 ...
$ LT.Debt.PAT.Ratio : num 0.96 0.759 0.821 0.465 0.79 ...
$ Revenue : num 54274 59404 67991 63817 64771 ...
$ Net.Income : num 6602 5591 6463 3368 5623 ...
$ Cash : num 1521 1360 2182 13016 11320 ...
$ Total.Curr.Liabilites : num 63145 64968 74297 71997 71845 ...
$ LTDebt : num 27520 29463 31473 28979 28466 ...
$ Moody : Factor w/ 15 levels "A1","A2","A3",..: 4 4 4 4 4
4 4 4 4 4 ...
$ Crane : Factor w/ 6 levels "S1","S2","S3",..: 2 2 2 2 2 2
2 2 2 2 ...
$ HML : Factor w/ 3 levels "H","L","M": 2 2 2 2 2 2 2 2 2
2 ...
$ Logit : int 1 1 1 1 1 1 1 1 1 1 ...

# Remove labels not used ----
# Company, Period and the rating columns Moody, Crane and HML are not inputs
# to the binary model. Logit is the binary response: every model below is
# fitted with `Logit ~ .` and the scatter-plot matrix colours its points by
# it, so it must stay in the data frame. (The original dropped Logit and kept
# HML, which left the response missing and an unused label among the
# predictors.)
Df2$Company <- NULL
Df2$Period <- NULL
Df2$Moody <- NULL
Df2$Crane <- NULL
Df2$HML <- NULL

# Store the response as a factor so it is handled as a class label (levels
# "0" and "1") and can index the two-colour palette below. As a raw 0/1
# integer, index 0 would silently select no colour at all.
Df2$Logit <- factor(Df2$Logit)

Df2

# Create correlation matrix ----
# Columns 1:17 are the numeric predictors Curr.Ratio ... Total.Curr.Liabilites.
# NOTE(review): column 18 (LTDebt) is left out here, exactly as in the
# original -- confirm whether that is intended.
pairs.panels(
  Df2[1:17],
  gap = 0,
  bg = c("red", "blue")[Df2$Logit],
  pch = 21
)

# Multicollinearity ----
Correlation <- cor(Df2[, 1:17])

# The same correlation matrix drawn in four different layouts.
corrplot(Correlation, type = "upper", method = "color")
corrplot(Correlation, method = "color")
corrplot(Correlation, method = "shade")
corrplot(Correlation, order = "AOE")

# Split between training & test ----
# Each row is assigned to group 1 (about 75 %) or group 2 (about 25 %);
# the seed makes the assignment reproducible.
set.seed(123)
Split2 <- sample(2, nrow(Df2), replace = TRUE, prob = c(0.75, 0.25))

training2 <- Df2[Split2 == 1, ]
testing2 <- Df2[Split2 == 2, ]

# The modelling code further down refers to the data as Df, training and
# testing, none of which is defined anywhere in this script. Define them
# here so the script runs top to bottom; Df2, training2 and testing2 are
# kept under their original names as well.
Df <- Df2
training <- training2
testing <- testing2


# SVM ----
# SMOTE for improving participation of all the label classes.
# perc.over = 500 generates five synthetic minority cases for every original
# minority case; perc.under = 100 keeps one majority case per synthetic case.
# The argument is spelled `learner` (lower case): the original `Learner` did
# not match it and was silently absorbed by `...`. With learner = NULL the
# call returns the re-balanced data frame rather than a fitted model.
training_SMOTE <- SMOTE(
  Logit ~ .,
  data = training,
  perc.over = 500,
  perc.under = 100,
  learner = NULL
)

# Descriptive / linear discriminant analysis of the full data set with
# DiscriMiner (the SVM models themselves are fitted further down).
library(DiscriMiner)

# Column names of the modelling data: 18 predictors followed by Logit.
names(Df)

[1] "Curr.Ratio" "Quick.ratio" "LT.Debt.Eq.Ratio"


[4] "Cash.LT.Debt.Ratio" "PAT.Asset.Ratio"
"Asset.Turnover.Ratio"
[7] "Inventory.Turnover.Ratio" "Return.on.Equity" "Net.asset"
[10] "Cash.Asset.Ratio" "Cash.Turnover.Ratio"
"LT.Debt.Turnover.Ratio"
[13] "LT.Debt.PAT.Ratio" "Revenue" "Net.Income"
[16] "Cash" "Total.Curr.Liabilites" "LTDebt"
[19] "Logit"

# Predictors are the first 18 columns; the class label (Logit) is column 19.
x1 <- Df[, 1:18]
y1 <- Df[, 19]

# Descriptive discriminant analysis of the predictors against the label.
Fisher <- desDA(x1, y1)

Fisher

Descriptive Discriminant Analysis


---------------------------------
$power discriminant power
$values table of eigenvalues
$discrivar discriminant variables
$discor correlations
$scores discriminant scores
---------------------------------

$power
cor_ratio wilks_lamb
F_statistic
Curr.Ratio 0.236923998891409998 0.763076001108589974
145.617678080087699755
Quick.ratio 0.118343655400584433 0.881656344599415553
62.953297759221825913
LT.Debt.Eq.Ratio 0.162736378428015166 0.837263621571984751
91.158100646293434011
Cash.LT.Debt.Ratio 0.004669637099692534 0.995330362900307408
2.200334563666029197
PAT.Asset.Ratio 0.010037044801625842 0.989962955198374184
4.755101175497249422
Asset.Turnover.Ratio 0.134652499119868291 0.865347500880131459
72.978799872868748366
Inventory.Turnover.Ratio 0.139709454209825557 0.860290545790174388
76.164656632631988487
Return.on.Equity 0.004179761881913783 0.995820238118086176
1.968536335756923217
Net.asset 0.074942767269602598 0.925057232730397416
37.995657572127043977
Cash.Asset.Ratio 0.098735330987855224 0.901264669012144637
51.379879657392976355
Cash.Turnover.Ratio 0.060595460945688108 0.939404539054311871
30.252431196614281816
LT.Debt.Turnover.Ratio 0.108006820771201800 0.891993179228798172
56.788773862025770711
LT.Debt.PAT.Ratio 0.025414566265284339 0.974585433734715623
12.230258288123410537
Revenue 0.175461782462952548 0.824538217537047480
99.803228309944628904
Net.Income 0.037613706647427719 0.962386293352572197
18.330298903354027118
Cash 0.017368353729127241 0.982631646270872627
8.289736983206433152
Total.Curr.Liabilites 0.098848299112338237 0.901151700887661944
51.445114333159182252
LTDebt 0.019782683412160728 0.980217316587839171
9.465328109689597369
p_values
Curr.Ratio 0.000000000000000000
Quick.ratio 0.000000000000015765
LT.Debt.Eq.Ratio 0.000000000000000000
Cash.LT.Debt.Ratio 0.138652460368144492
PAT.Asset.Ratio 0.029707822835484898
Asset.Turnover.Ratio 0.000000000000000222
Inventory.Turnover.Ratio 0.000000000000000000
Return.on.Equity 0.161264970321706058
Net.asset 0.000000001529587457
Cash.Asset.Ratio 0.000000000002976508
Cash.Turnover.Ratio 0.000000062455525751
LT.Debt.Turnover.Ratio 0.000000000000252243
LT.Debt.PAT.Ratio 0.000514966622355173
Revenue 0.000000000000000000
Net.Income 0.000022535206089858
Cash 0.004169267460182757
Total.Curr.Liabilites 0.000000000002888800
LTDebt 0.002216377327768915

$values
value proportion accumulated
DF1 2.89 100.00 100.00

$discrivar
DF1
constant 1.650e+00
Curr.Ratio -2.744e+00
Quick.ratio 4.461e-01
LT.Debt.Eq.Ratio -3.557e+00
Cash.LT.Debt.Ratio 1.278e+00
PAT.Asset.Ratio 1.546e+01
Asset.Turnover.Ratio -9.386e-01
Inventory.Turnover.Ratio 1.096e-01
Return.on.Equity -5.430e+00
Net.asset -1.518e-05
Cash.Asset.Ratio -5.101e+00
Cash.Turnover.Ratio -6.227e-05
LT.Debt.Turnover.Ratio -9.931e-02
LT.Debt.PAT.Ratio -7.401e-03
Revenue 7.438e-06
Net.Income -8.124e-05
Cash -1.109e-05
Total.Curr.Liabilites 4.427e-05
LTDebt 3.364e-05

$discor
DF1
Curr.Ratio -0.66015
Quick.ratio -0.46656
LT.Debt.Eq.Ratio -0.54711
Cash.LT.Debt.Ratio -0.09268
PAT.Asset.Ratio 0.13587
Asset.Turnover.Ratio 0.49767

$scores
z1
1 0.2987
2 0.2547
3 0.2667
4 0.5708
5 0.4985
6 0.3676
...

# Linear discriminant analysis on the same predictors (x1) and labels (y1).
Mahalanobis <- linDA(x1, y1)

Mahalanobis

Linear Discriminant Analysis


-------------------------------------------
$functions discrimination functions
$confusion confusion matrix
$scores discriminant scores
$classification assigned class
$error_rate error rate
-------------------------------------------

$functions
0 1
constant -136.956190 -130.225396
Curr.Ratio 48.423178 41.563393
Quick.ratio 6.125443 7.240412
LT.Debt.Eq.Ratio 167.970144 159.079842
Cash.LT.Debt.Ratio 52.235965 55.431653
PAT.Asset.Ratio -517.136381 -478.481813
Asset.Turnover.Ratio 104.444688 102.098668
Inventory.Turnover.Ratio 0.433799 0.707770
Return.on.Equity 163.542647 149.968637
Net.asset 0.001271 0.001233
Cash.Asset.Ratio -145.195091 -157.946677
Cash.Turnover.Ratio 0.077044 0.076889
LT.Debt.Turnover.Ratio -7.459948 -7.708176
LT.Debt.PAT.Ratio 74.316780 74.298280
Revenue -0.001507 -0.001489
Net.Income -0.003264 -0.003467
Cash -0.000975 -0.001003
Total.Curr.Liabilites 0.001064 0.001175
LTDebt -0.001275 -0.001191

$confusion
predicted
original 0 1
0 94 26
1 22 329

$error_rate
[1] 0.1019108

$scores
0 1
1 132.8903 136.2422
2 131.2891 134.5312
3 141.8627 145.1347
4 113.8299 117.8621
5 125.0948 128.9464
6 131.4325 134.9569
...

$classification
[1] 1 1 1 1 1 1
Levels: 0 1
...

# SVM on entire data with binary prediction class ----
library(e1071)

# Radial-kernel SVM on all observations.
# The predictors range from ratios below 1 to balance-sheet amounts in the
# tens of thousands, so they have to be standardised before a distance-based
# kernel is applied. With scale = FALSE the kernel is dominated by the
# large-valued columns: the recorded run kept all 471 observations as support
# vectors, i.e. it memorised the data instead of learning a boundary.
# scale = TRUE (the svm() default) standardises every predictor internally.
# The console output pasted after this call was produced with scale = FALSE.
SVMModel1 <- svm(
  Logit ~ .,
  data = Df,
  kernel = "radial",
  cost = 1,
  scale = TRUE
)

summary(SVMModel1)
Call:
svm(formula = Logit ~ ., data = Df, kernel = "radial", cost = 1, scale =
FALSE)

Parameters:
SVM-Type: C-classification
SVM-Kernel: radial
cost: 1

Number of Support Vectors: 471

( 351 120 )

Number of Classes: 2

Levels:
0 1

# Fitted classes for the very data SVMModel1 was trained on (no new data is
# supplied), so this measures resubstitution fit rather than generalisation.
Prediction <- predict(object = SVMModel1)

Prediction

# Confusion matrix: actual label in rows, fitted label in columns.
table(Actual = Df$Logit, Predicted = Prediction)

Predicted
Actual 0 1
0 120 0
1 0 351

# SVM on training data with binary prediction class ----

# Radial-kernel SVM on the SMOTE-balanced training set.
# scale = TRUE (the svm() default) standardises every predictor internally.
# With scale = FALSE the large-valued columns dominate the kernel: the
# recorded run reported 943 support vectors and a perfect fit on its own
# training rows, which is memorisation rather than a learned boundary.
# The console output pasted after this call was produced with scale = FALSE.
SVMModel2 <- svm(
  Logit ~ .,
  data = training_SMOTE,
  kernel = "radial",
  cost = 1,
  scale = TRUE
)

summary(SVMModel2)

Call:
svm(formula = Logit ~ ., data = training_SMOTE, kernel = "radial", cost = 1,
scale = FALSE)

Parameters:
SVM-Type: C-classification
SVM-Kernel: radial
cost: 1

Number of Support Vectors: 943

( 367 576 )
Number of Classes: 2

Levels:
0 1

# Fitted classes for the rows SVMModel2 was trained on (resubstitution).
Prediction2 <- predict(object = SVMModel2)

Prediction2

# Confusion matrix on the SMOTE-balanced training set:
# actual label in rows, fitted label in columns.
table(Actual = training_SMOTE$Logit, Predicted = Prediction2)

Predicted
Actual 0 1
0 576 0
1 0 480

# SVM on testing data with binary prediction class ----

# Held-out evaluation: score the test rows with the model that was fitted on
# the training data (SVMModel2). The original fitted a separate model
# (SVMModel3) on the test set and then predicted that same data, so the
# "test" confusion matrix was a resubstitution result and said nothing about
# performance on unseen observations. SVMModel3 is therefore no longer built.
Prediction3 <- predict(SVMModel2, newdata = testing)

Prediction3

# Confusion matrix on the held-out data: actual label in rows, predicted
# label in columns.
table(Actual = testing$Logit, Predicted = Prediction3)

# Console output recorded from the original approach, kept for reference
# only. It comes from svm() fitted on `testing` itself with scale = FALSE
# and does not correspond to the code above.
#
#   Call:
#   svm(formula = Logit ~ ., data = testing, kernel = "radial", cost = 1, scale =
#   FALSE)
#
#   Parameters:
#   SVM-Type: C-classification
#   SVM-Kernel: radial
#   cost: 1
#
#   Number of Support Vectors: 111
#
#   ( 87 24 )
#
#   Number of Classes: 2
#
#   Levels:
#   0 1
#
#   Predicted
#   Actual 0 1
#   0 24 0
#   1 0 87
## K Fold cross validation for SVM ----

library(kernlab)

# Resampling scheme: 10-fold cross-validation. The seed is set before the
# control object is built, as in the original.
set.seed(123)
train.control <- trainControl(method = "cv", number = 10)

# Fit a radial-basis SVM through caret on the full data set, letting caret
# evaluate its default tuning grid under the scheme defined above.
model11 <- train(
  Logit ~ .,
  data = Df,
  method = "svmRadial",
  trControl = train.control
)

# Cross-validated accuracy and kappa per candidate, plus the selected values.
print(model11)

Support Vector Machines with Radial Basis Function Kernel

471 samples
18 predictor
2 classes: '0', '1'

No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 424, 424, 424, 424, 424, 424, ...
Resampling results across tuning parameters:

C Accuracy Kappa
0.25 0.8789894 0.6540279
0.50 0.9002660 0.7331950
1.00 0.9193262 0.7844146

Tuning parameter 'sigma' was held constant at a value of 0.06757086


Accuracy was used to select the optimal model using the largest value.
The final values used for the model were sigma = 0.06757086 and C = 1.

S-ar putea să vă placă și