# (Document-viewer residue: "You are on page 1 of 19" -- not part of the script.)

# ---- Working directory -------------------------------------------------------

# Show the current working directory.
getwd()

# Switch to the project directory; relative output paths used below
# (e.g. "cor_matrix.csv") are written here.
setwd("C:/Users/name/R_Directory/Scam_Model")

# set.seed(123)

# ---- Load and inspect the training data --------------------------------------

train_new <- read.csv(
  "C:/Users/name/R_Directory/Scam_Model/RFM_TRAIN_3.csv",
  header = TRUE
)

# Column types, dimensions and the first rows (head() shows 6 rows by default).
str(train_new)
dim(train_new)
head(train_new)

# Distribution of the response variable.
summary(train_new$scam_final)
table(train_new$scam_final)

# Correlation matrix of all columns, saved to the working directory.
# NOTE(review): cor() needs every column to be numeric -- presumably true for
# this file at this point; confirm.
cor_matrix <- cor(train_new)
write.csv(cor_matrix, "cor_matrix.csv")

# ---- Count distinct values per column ----------------------------------------
# Columns with very few distinct values are candidates for conversion to
# factors. The columns are visited directly instead of through apply(), which
# would first coerce the whole data frame to a matrix (and, if any column is
# non-numeric, compare numbers through their formatted text).
# Result: a named integer vector, one entry per column.
Un_val <- vapply(train_new, function(x) length(unique(x)), integer(1))

# ---- Convert categorical columns to factors ----------------------------------
# The distinct-value counts suggest six categorical variables:
#   device_js_enabled, device_cookie_enabled, opacket_dfp_device_new,
#   isp_match, ip_mismatch, scam_final
# NOTE(review): isp_match is named above but is not in `cols`, so it stays
# unconverted -- confirm whether that is intentional.
cols <- c(
  "scam_final",
  "device_js_enabled",
  "device_cookie_enabled",
  "opacket_dfp_device_new",
  "ip_mismatch"
)

# Convert every listed column in place. Each name must be a single-line
# string: a line break inside the quotes becomes part of the column name and
# the lookup fails.
train_new[cols] <- lapply(train_new[cols], as.factor)

# Verify the new column types once, after all conversions are done.
str(train_new)
dim(train_new)
##################Build Random forest model#######################

library(randomForest)

#####Get column index number of Response variable###############

# Baseline forest: 10 trees, every other setting left at the package default.
# The seed is fixed so the fit can be repeated.
# NOTE(review): the object name ends in "_5" although ntree is 10.
set.seed(123)
modelRF_new_5 <- randomForest(
  scam_final ~ .,
  data = train_new,
  ntree = 10
)

# randomForest(formula = scam_final ~ ., data = train_new, ntree = 10)

# Type of random forest: classification

# Number of trees: 10

# No. of variables tried at each split: 11

# OOB estimate of error rate: 3.78%

# Confusion matrix:

# 0 1 class.error

# 0 55885 645 0.01140987

# 1 1562 352 0.81609195

# Check how the class-1 error reacts to nodesize and maxnodes:
# 10 trees with nodesize = 2 and at most 10000 terminal nodes per tree.
set.seed(123)
modelRF_new_10_1 <- randomForest(
  scam_final ~ .,
  data = train_new,
  ntree = 10,
  nodesize = 2,
  maxnodes = 10000
)

# randomForest(formula = scam_final ~ ., data = train_new, ntree = 10,
#     nodesize = 2, maxnodes = 10000)

# Type of random forest: classification

# Number of trees: 10

# No. of variables tried at each split: 11


# OOB estimate of error rate: 3.72%

# Confusion matrix:

# 0 1 class.error

# 0 55920 612 0.01082573

# 1 1562 355 0.81481481

# Same default settings as the baseline, grown to 50 trees.
set.seed(123)
modelRF_new_50 <- randomForest(
  scam_final ~ .,
  data = train_new,
  ntree = 50
)

# randomForest(formula = scam_final ~ ., data = train_new, ntree = 50)

# Type of random forest: classification

# Number of trees: 50

# No. of variables tried at each split: 11

# OOB estimate of error rate: 3.03%

# Confusion matrix:

# 0 1 class.error

# 0 56960 109 0.001909969

# 1 1680 253 0.869115365

# 50 trees with nodesize = 2 and at most 10000 terminal nodes per tree.
set.seed(123)
modelRF_new_50_1 <- randomForest(
  scam_final ~ .,
  data = train_new,
  ntree = 50,
  nodesize = 2,
  maxnodes = 10000
)

# randomForest(formula = scam_final ~ ., data = train_new, ntree = 50,
#     nodesize = 2, maxnodes = 10000)

# Type of random forest: classification

# Number of trees: 50

# No. of variables tried at each split: 11

# OOB estimate of error rate: 3.03%


# Confusion matrix:

# 0 1 class.error

# 0 56979 90 0.001577038

# 1 1696 237 0.877392654

# 25 trees with nodesize = 1 and at most 10000 terminal nodes per tree.
# NOTE(review): the console output recorded below reports nodesize = 2, not 1
# -- confirm which run those results belong to.
set.seed(123)
modelRF_new_25_1 <- randomForest(
  scam_final ~ .,
  data = train_new,
  ntree = 25,
  nodesize = 1,
  maxnodes = 10000
)

# randomForest(formula = scam_final ~ ., data = train_new, ntree = 25,
#     nodesize = 2, maxnodes = 10000)

# Type of random forest: classification

# Number of trees: 25

# No. of variables tried at each split: 11

# OOB estimate of error rate: 3.07%

# Confusion matrix:

# 0 1 class.error

# 0 56908 161 0.002821146

# 1 1649 284 0.853078117

# 100 trees with nodesize = 2 and at most 10000 terminal nodes per tree.
set.seed(123)
modelRF_new_100_1 <- randomForest(
  scam_final ~ .,
  data = train_new,
  ntree = 100,
  nodesize = 2,
  maxnodes = 10000
)

# randomForest(formula = scam_final ~ ., data = train_new, ntree = 100,
#     nodesize = 2, maxnodes = 10000)

# Type of random forest: classification

# Number of trees: 100

# No. of variables tried at each split: 11


# OOB estimate of error rate: 2.98%

# Confusion matrix:

# 0 1 class.error

# 0 57009 60 0.001051359

# 1 1700 233 0.879461976

####**********************************************************************###

###Ntree=10 is the optimal ntree value************************************###

###***********************************************************************###

# ---- Tune mtry (10 trees per trial) ------------------------------------------
# Position of the response column, printed for reference only: the predictor
# set below is selected by column name, so it stays correct if the column
# order of the input file changes.
grep("scam_final", colnames(train_new))

# tuneRF() starts from the default mtry and scales it by stepFactor (1.5) in
# each direction, continuing in a direction only while the relative
# improvement in OOB error is at least `improve` (0.01).
# The seed makes the tuning run repeatable; without it every run differs.
set.seed(123)
bestmtry <- tuneRF(
  x = train_new[, setdiff(colnames(train_new), "scam_final"), drop = FALSE],
  y = train_new$scam_final,
  ntreeTry = 10,
  stepFactor = 1.5,
  improve = 0.01,
  trace = TRUE,
  plot = TRUE
)

# mtry = 11 OOB error = 3.83%

# Searching left ...

# mtry = 8 OOB error = 3.74%

# 0.02349072 0.01

# mtry = 6 OOB error = 3.63%


# 0.0290644 0.01

# mtry = 4 OOB error = 3.52%

# 0.03096187 0.01

# mtry = 3 OOB error = 3.43%

# 0.02621193 0.01

# mtry = 2 OOB error = 3.32%

# 0.03306647 0.01

# Searching right ...

# mtry = 16 OOB error = 3.89%

# -0.1744363 0.01

# Keep the mtry value(s) whose OOB error is the smallest in the tuning table,
# then show the table and the selection.
best.m <- bestmtry[
  bestmtry[, "OOBError"] == min(bestmtry[, "OOBError"]),
  "mtry"
]
print(bestmtry)
print(best.m)

# mtry OOBError

# 2.OOB 2 0.03316327

# 3.OOB 3 0.03429736

# 4.OOB 4 0.03522056

# 6.OOB 6 0.03634589

# 8.OOB 8 0.03743389

# 11.OOB 11 0.03833439

# 16.OOB 16 0.03894814

# > print(best.m)

# [1] 2

# 10 trees, 16 candidate variables per split, nodesize = 2, at most 10000
# terminal nodes per tree.
# NOTE(review): the tuning output recorded above reports the lowest OOB error
# at mtry = 2; mtry = 16 is used here -- confirm the rationale.
set.seed(123)
modelRF_new_10_16 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 16,
  ntree = 10,
  nodesize = 2,
  maxnodes = 10000
)
# randomForest(formula = scam_final ~ ., data = train_new, mtry = 16,
#     ntree = 10, nodesize = 2, maxnodes = 10000)

# Type of random forest: classification

# Number of trees: 10

# No. of variables tried at each split: 16

# OOB estimate of error rate: 3.75%

# Confusion matrix:

# 0 1 class.error

# 0 55893 627 0.01109342

# 1 1563 345 0.81918239

# 30 trees, 16 candidate variables per split, nodesize = 2, at most 10000
# terminal nodes per tree.
set.seed(123)
modelRF_new_30_16 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 16,
  ntree = 30,
  nodesize = 2,
  maxnodes = 10000
)

# randomForest(formula = scam_final ~ ., data = train_new, mtry = 16,
#     ntree = 30, nodesize = 2, maxnodes = 10000)

# Type of random forest: classification

# Number of trees: 30

# No. of variables tried at each split: 16

# OOB estimate of error rate: 3.1%

# Confusion matrix:

# 0 1 class.error

# 0 56899 170 0.00297885

# 1 1657 276 0.85721676

# ---- Tune mtry again (30 trees per trial, finer improvement threshold) -------
# Predictors are selected by name rather than by a fixed column position, and
# the seed makes the tuning run repeatable.
set.seed(123)
bestmtry <- tuneRF(
  x = train_new[, setdiff(colnames(train_new), "scam_final"), drop = FALSE],
  y = train_new$scam_final,
  ntreeTry = 30,
  stepFactor = 1.5,
  improve = 0.001,
  trace = TRUE,
  plot = TRUE
)
print(bestmtry)

# Recompute the selection from this run before printing it; otherwise best.m
# would still hold the value chosen by the earlier tuning pass.
best.m <- bestmtry[bestmtry[, 2] == min(bestmtry[, 2]), 1]
print(best.m)

######*************************************Final Model*********************************#####

# "Final model" candidate: 10 trees, 16 candidate variables per split,
# nodesize = 2, at most 10000 terminal nodes per tree.
# NOTE(review): same seed and arguments as the earlier modelRF_new_10_16 fit,
# so this re-creates that object.
set.seed(123)
modelRF_new_10_16 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 16,
  ntree = 10,
  nodesize = 2,
  maxnodes = 10000
)

# randomForest(formula = scam_final ~ ., data = train_new, mtry = 16,
#     ntree = 10, nodesize = 2, maxnodes = 10000)

# Type of random forest: classification

# Number of trees: 10

# No. of variables tried at each split: 16

# OOB estimate of error rate: 3.75%

# Confusion matrix:

# 0 1 class.error

# 0 55893 627 0.01109342

# 1 1563 345 0.81918239

# Neighbouring setting: 14 candidate variables per split, 10 trees,
# nodesize = 2, at most 10000 terminal nodes per tree.
set.seed(123)
modelRF_new_10_14 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 14,
  ntree = 10,
  nodesize = 2,
  maxnodes = 10000
)

# randomForest(formula = scam_final ~ ., data = train_new, mtry = 14,
#     ntree = 10, nodesize = 2, maxnodes = 10000)
# Type of random forest: classification

# Number of trees: 10

# No. of variables tried at each split: 14

# OOB estimate of error rate: 3.7%

# Confusion matrix:

# 0 1 class.error

# 0 55901 587 0.01039159

# 1 1572 339 0.82260597

# Neighbouring setting: 18 candidate variables per split, 10 trees,
# nodesize = 2, at most 10000 terminal nodes per tree.
set.seed(123)
modelRF_new_10_18 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 18,
  ntree = 10,
  nodesize = 2,
  maxnodes = 10000
)

# randomForest(formula = scam_final ~ ., data = train_new, mtry = 18,
#     ntree = 10, nodesize = 2, maxnodes = 10000)

# Type of random forest: classification

# Number of trees: 10

# No. of variables tried at each split: 18

# OOB estimate of error rate: 3.75%

# Confusion matrix:

# 0 1 class.error

# 0 55847 648 0.01147004

# 1 1543 371 0.80616510

# ---- Tune mtry once more (50 trees per trial) --------------------------------
# Predictors are selected by name rather than by a fixed column position, and
# the seed makes the tuning run repeatable.
# NOTE(review): the result overwrites `bestmtry` but is not printed or used
# afterwards in the visible part of the script.
set.seed(123)
bestmtry <- tuneRF(
  x = train_new[, setdiff(colnames(train_new), "scam_final"), drop = FALSE],
  y = train_new$scam_final,
  ntreeTry = 50,
  stepFactor = 1.5,
  improve = 0.001,
  trace = TRUE,
  plot = TRUE
)

# 100 trees, 16 candidate variables per split, nodesize = 2, at most 10000
# terminal nodes per tree.
set.seed(123)
modelRF_new_100_16 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 16,
  ntree = 100,
  nodesize = 2,
  maxnodes = 10000
)
# randomForest(formula = scam_final ~ ., data = train_new, mtry = 16,
#     ntree = 100, nodesize = 2, maxnodes = 10000)

# Type of random forest: classification

# Number of trees: 100

# No. of variables tried at each split: 16

# OOB estimate of error rate: 2.99%

# Confusion matrix:

# 0 1 class.error

# 0 56984 85 0.001489425

# 1 1680 253 0.869115365

# 50 trees, 16 candidate variables per split, nodesize = 2, at most 10000
# terminal nodes per tree.
set.seed(123)
modelRF_new_50_16 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 16,
  ntree = 50,
  nodesize = 2,
  maxnodes = 10000
)

# randomForest(formula = scam_final ~ ., data = train_new, mtry = 16,
#     ntree = 50, nodesize = 2, maxnodes = 10000)

# Type of random forest: classification

# Number of trees: 50

# No. of variables tried at each split: 16

# OOB estimate of error rate: 3.02%

# Confusion matrix:

# 0 1 class.error

# 0 56940 129 0.002260422

# 1 1653 280 0.855147439

# 20 trees, 18 candidate variables per split, nodesize = 2, at most 10000
# terminal nodes per tree.
# NOTE(review): the object name ends in "_16" but mtry is 18 -- confirm which
# one is intended.
set.seed(123)
modelRF_new_20_16 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 18,
  ntree = 20,
  nodesize = 2,
  maxnodes = 10000
)

# randomForest(formula = scam_final ~ ., data = train_new, mtry = 18,
#     ntree = 20, nodesize = 2, maxnodes = 10000)

# Type of random forest: classification

# Number of trees: 20

# No. of variables tried at each split: 18

# OOB estimate of error rate: 3.11%

# Confusion matrix:

# 0 1 class.error

# 0 56820 243 0.004258451

# 1 1592 341 0.823590274

# 20 trees, 18 candidate variables per split, nodesize = 2, at most 10000
# terminal nodes per tree.
# This call used to pass cv.fold = 10 as well. cv.fold is an argument of
# rfcv(), not of randomForest(); randomForest() swallows it through `...`, so
# no cross-validation was ever performed (the results recorded below match the
# run without it). It is dropped so the call states what is actually fitted.
# NOTE(review): the object name ends in "_16" but mtry is 18.
set.seed(123)
modelRF_new_20_16 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 18,
  ntree = 20,
  nodesize = 2,
  maxnodes = 10000
)

# randomForest(formula = scam_final ~ ., data = train_new, cv.fold = 10,
#     mtry = 18, ntree = 20, nodesize = 2, maxnodes = 10000)

# Type of random forest: classification

# Number of trees: 20

# No. of variables tried at each split: 18

# OOB estimate of error rate: 3.11%

# Confusion matrix:

# 0 1 class.error

# 0 56820 243 0.004258451

# 1 1592 341 0.823590274


# 15 trees, 18 candidate variables per split, nodesize = 2, at most 10000
# terminal nodes per tree.
# This call used to pass cv.fold = 10 as well. cv.fold is an argument of
# rfcv(), not of randomForest(); randomForest() swallows it through `...`, so
# it never triggered cross-validation. It is dropped so the call states what
# is actually fitted.
set.seed(123)
modelRF_new_15_18 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 18,
  ntree = 15,
  nodesize = 2,
  maxnodes = 10000
)

# randomForest(formula = scam_final ~ ., data = train_new, cv.fold = 10,
#     mtry = 18, ntree = 15, nodesize = 2, maxnodes = 10000)

# Type of random forest: classification

# Number of trees: 15

# No. of variables tried at each split: 18

# OOB estimate of error rate: 3.28%

# Confusion matrix:

# 0 1 class.error

# 0 56649 367 0.00643679

# 1 1566 361 0.81266217

# 10 trees, 20 candidate variables per split, nodesize = 2, at most 10000
# terminal nodes per tree.
# This call used to pass cv.fold = 10 as well. cv.fold is an argument of
# rfcv(), not of randomForest(); randomForest() swallows it through `...`, so
# it never triggered cross-validation. It is dropped so the call states what
# is actually fitted.
set.seed(123)
modelRF_new_10_20 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 20,
  ntree = 10,
  nodesize = 2,
  maxnodes = 10000
)

# randomForest(formula = scam_final ~ ., data = train_new, cv.fold = 10,
#     mtry = 20, ntree = 10, nodesize = 2, maxnodes = 10000)

# Type of random forest: classification

# Number of trees: 10

# No. of variables tried at each split: 20

# OOB estimate of error rate: 3.77%

# Confusion matrix:
# 0 1 class.error

# 0 55841 645 0.01141876

# 1 1555 360 0.81201044

# 10 trees, 18 candidate variables per split, nodesize = 2, at most 10000
# terminal nodes per tree.
# This call used to pass cv.fold = 10 as well. cv.fold is an argument of
# rfcv(), not of randomForest(); randomForest() swallows it through `...`, so
# no cross-validation was ever performed (the results recorded below match the
# earlier mtry = 18, ntree = 10 run without it). It is dropped so the call
# states what is actually fitted.
set.seed(123)
modelRF_new_10_18 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 18,
  ntree = 10,
  nodesize = 2,
  maxnodes = 10000
)

# randomForest(formula = scam_final ~ ., data = train_new, cv.fold = 10,
#     mtry = 18, ntree = 10, nodesize = 2, maxnodes = 10000)

# Type of random forest: classification

# Number of trees: 10

# No. of variables tried at each split: 18

# OOB estimate of error rate: 3.75%

# Confusion matrix:

# 0 1 class.error

# 0 55847 648 0.01147004

# 1 1543 371 0.80616510

# 10 trees, 18 candidate variables per split, nodesize = 10, no cap on the
# number of terminal nodes.
# This call used to pass cv.fold = 10 as well. cv.fold is an argument of
# rfcv(), not of randomForest(); randomForest() swallows it through `...`, so
# it never triggered cross-validation. It is dropped so the call states what
# is actually fitted.
# NOTE(review): this reuses the name modelRF_new_10_18 and so replaces any
# earlier fit stored under it.
set.seed(123)
modelRF_new_10_18 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 18,
  ntree = 10,
  nodesize = 10
)

# randomForest(formula = scam_final ~ ., data = train_new, cv.fold = 10,
#     mtry = 18, ntree = 10, nodesize = 10)

# Type of random forest: classification

# Number of trees: 10

# No. of variables tried at each split: 18

# OOB estimate of error rate: 3.48%


# Confusion matrix:

# 0 1 class.error

# 0 56020 446 0.007898558

# 1 1587 324 0.830455259

# 10 trees, 18 candidate variables per split, nodesize = 5, no cap on the
# number of terminal nodes.
# This call used to pass cv.fold = 10 as well. cv.fold is an argument of
# rfcv(), not of randomForest(); randomForest() swallows it through `...`, so
# it never triggered cross-validation. It is dropped so the call states what
# is actually fitted.
# NOTE(review): this reuses the name modelRF_new_10_18; the predict() calls
# further down the script refer to that name, so they score this fit.
set.seed(123)
modelRF_new_10_18 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 18,
  ntree = 10,
  nodesize = 5
)

# randomForest(formula = scam_final ~ ., data = train_new, cv.fold = 10,
#     mtry = 18, ntree = 10, nodesize = 5)

# Type of random forest: classification

# Number of trees: 10

# No. of variables tried at each split: 18

# OOB estimate of error rate: 3.61%

# Confusion matrix:

# 0 1 class.error

# 0 55943 546 0.009665599

# 1 1563 348 0.817896389

###############################################################
# ---- Performance on the TEST sample ------------------------------------------

# NOTE(review): unlike the training-data path, this path has no drive prefix
# -- confirm it resolves to the same project directory.
test_new <- read.csv(
  "/Users/name/R_Directory/Scam_Model/RFM_TEST_3.csv",
  header = TRUE
)

summary(test_new$scam_final)
table(test_new$scam_final)

# Categorical columns to convert to factors. Each name must be a single-line
# string: a line break inside the quotes becomes part of the column name and
# the lookup fails.
cols <- c(
  "scam_final",
  "device_js_enabled",
  "device_cookie_enabled",
  "opacket_dfp_device_new",
  "ip_mismatch"
)
test_new[cols] <- lapply(test_new[cols], as.factor)

str(test_new)

# Predicted class labels, cross-tabulated against the actual labels.
pred_class_modelTest <- predict(modelRF_new_10_18, test_new, type = "class")
t_test_new <- table(
  predictions = pred_class_modelTest,
  actual = test_new$scam_final
)

# Accuracy: share of observations on the diagonal of the confusion table.
sum(diag(t_test_new)) / sum(t_test_new)

# ROC curve and AUC, using the second column of the class-probability matrix
# as the score. The ROC object is built once and reused for both.
library(pROC)

pred_prob_modelTest <- predict(modelRF_new_10_18, test_new, type = "prob")
roc_test <- roc(test_new$scam_final, pred_prob_modelTest[, 2])
auc_test <- auc(roc_test)
plot(roc_test)

# Position of the response column, printed for reference only.
grep("scam_final", colnames(test_new))

# Actual label next to the predicted class probabilities. The label is taken
# by column name instead of a fixed position (119) so a change in column order
# cannot export the wrong column; as a result the first CSV column is now
# headed "scam_final". The probabilities computed above are reused.
OutPut_test_new <- data.frame(
  scam_final = test_new$scam_final,
  pred_prob_modelTest
)

write.csv(
  OutPut_test_new,
  "/Users/name/R_Directory/Scam_Model/OutPut_test_new.csv"
)

#####***************Performance TEST END************************###

###############################################################

# ---- Performance on the OOT sample -------------------------------------------

# NOTE(review): unlike the training-data path, this path has no drive prefix
# -- confirm it resolves to the same project directory.
oot_new <- read.csv(
  "/Users/name/R_Directory/Scam_Model/RFM_OOT_3.csv",
  header = TRUE
)

summary(oot_new$scam_final)
table(oot_new$scam_final)

# Categorical columns to convert to factors. The list is restated here so
# this section does not depend on `cols` having been set correctly earlier.
cols <- c(
  "scam_final",
  "device_js_enabled",
  "device_cookie_enabled",
  "opacket_dfp_device_new",
  "ip_mismatch"
)
oot_new[cols] <- lapply(oot_new[cols], as.factor)

str(oot_new)
table(oot_new$scam_final)

# Predicted class labels, cross-tabulated against the actual labels.
pred_class_modeloot <- predict(modelRF_new_10_18, oot_new, type = "class")
t_oot_new <- table(
  predictions = pred_class_modeloot,
  actual = oot_new$scam_final
)

# Accuracy: share of observations on the diagonal of the confusion table.
sum(diag(t_oot_new)) / sum(t_oot_new)

# ROC curve and AUC, using the second column of the class-probability matrix
# as the score. The ROC object is built once and reused for both.
library(pROC)

pred_prob_modeloot <- predict(modelRF_new_10_18, oot_new, type = "prob")
roc_oot_new <- roc(oot_new$scam_final, pred_prob_modeloot[, 2])
auc_oot_new <- auc(roc_oot_new)
plot(roc_oot_new)

# Position of the response column, printed for reference only.
grep("scam_final", colnames(oot_new))

# Actual label next to the predicted class probabilities. The label is taken
# by column name instead of a fixed position (119) so a change in column order
# cannot export the wrong column; as a result the first CSV column is now
# headed "scam_final". The probabilities computed above are reused.
OutPut_oot_new <- data.frame(
  scam_final = oot_new$scam_final,
  pred_prob_modeloot
)

write.csv(
  OutPut_oot_new,
  "/Users/name/R_Directory/Scam_Model/OutPut_oot_new.csv"
)

# Extract the first tree of the forest with variable names as split labels,
# print it, and save it to the working directory.
tree <- getTree(modelRF_new_10_18, k = 1, labelVar = TRUE)
tree
write.csv(tree, "tree1.csv")
###############################################################

library(h2o)

# (Document-viewer residue: "You might also like" -- not part of the script.)