# RandomForest H2O

# ---- Load and inspect the training data ----
# Show the current working directory.
getwd()
# Point the session at the project folder; relative output paths used below
# (e.g. "cor_matrix.csv") are resolved against it.
setwd("C:/Users/name/R_Directory/Scam_Model")
# set.seed(123)
# Import the training CSV file.
train_new <- read.csv(
  "C:/Users/name/R_Directory/Scam_Model/RFM_TRAIN_3.csv",
  header = TRUE
)
# Structure, dimensions and first rows of the data frame.
str(train_new)
dim(train_new)
head(train_new)
# Distribution of the response variable.
summary(train_new$scam_final)
table(train_new$scam_final)
# Correlation matrix, exported for offline review. cor() needs numeric input,
# so this runs before the factor conversion below.
cor_matrix <- cor(train_new)
write.csv(cor_matrix, "cor_matrix.csv")
# Number of distinct values per column, used to spot categorical variables.
Un_val <- vapply(train_new, function(x) length(unique(x)), integer(1))
# Columns to be treated as categorical and converted to factors.
# NOTE(review): the original note also listed isp_match as categorical, but it
# was never part of this vector -- confirm whether it should be converted too.
cols <- c(
  "scam_final", "device_js_enabled", "device_cookie_enabled",
  "opacket_dfp_device_new", "ip_mismatch"
)
train_new[cols] <- lapply(train_new[cols], as.factor)
# Re-check structure and dimensions after the conversion.
str(train_new)
dim(train_new)
################## Build random forest models: ntree search ##################
library(randomForest)
# Every fit is preceded by set.seed(123) so the runs are comparable.
# The commented blocks after each fit are the outputs recorded by the author.

# Baseline: 10 trees, all other settings at their defaults.
set.seed(123)
modelRF_new_5 <- randomForest(scam_final ~ ., data = train_new, ntree = 10)
# randomForest(formula = scam_final ~ ., data = train_new, ntree = 10)
# Type of random forest: classification
# Number of trees: 10
# No. of variables tried at each split: 11
# OOB estimate of error rate: 3.78%
# Confusion matrix:
# 0 1 class.error
# 0 55885 645 0.01140987
# 1 1562 352 0.81609195

# Check how the class-1 error reacts to nodesize / maxnodes (10 trees).
set.seed(123)
modelRF_new_10_1 <- randomForest(
  scam_final ~ ., data = train_new,
  ntree = 10, nodesize = 2, maxnodes = 10000
)
# randomForest(formula = scam_final ~ ., data = train_new, ntree = 10, nodesize = 2, maxnodes = 10000)
# Confusion matrix:
# 0 1 class.error
# 0 55920 612 0.01082573
# 1 1562 355 0.81481481

# 50 trees, default nodesize / maxnodes.
set.seed(123)
modelRF_new_50 <- randomForest(scam_final ~ ., data = train_new, ntree = 50)
# randomForest(formula = scam_final ~ ., data = train_new, ntree = 50)
# Confusion matrix:
# 0 1 class.error
# 0 56960 109 0.001909969
# 1 1680 253 0.869115365

# NOTE(review): the right-hand sides of the next three assignments were missing
# in the source; they are restored from the printed call echoes
# (ntree = 50 / 25 / 100, nodesize = 2, maxnodes = 10000) -- confirm.
set.seed(123)
modelRF_new_50_1 <- randomForest(
  scam_final ~ ., data = train_new,
  ntree = 50, nodesize = 2, maxnodes = 10000
)
# randomForest(formula = scam_final ~ ., data = train_new, ntree = 50, nodesize = 2, maxnodes = 10000)
# Confusion matrix:
# 0 1 class.error
# 0 56979 90 0.001577038
# 1 1696 237 0.877392654

set.seed(123)
modelRF_new_25_1 <- randomForest(
  scam_final ~ ., data = train_new,
  ntree = 25, nodesize = 2, maxnodes = 10000
)
# randomForest(formula = scam_final ~ ., data = train_new, ntree = 25, nodesize = 2, maxnodes = 10000)
# Confusion matrix:
# 0 1 class.error
# 0 56908 161 0.002821146
# 1 1649 284 0.853078117

set.seed(123)
modelRF_new_100_1 <- randomForest(
  scam_final ~ ., data = train_new,
  ntree = 100, nodesize = 2, maxnodes = 10000
)
# randomForest(formula = scam_final ~ ., data = train_new, ntree = 100, nodesize = 2, maxnodes = 10000)
# Confusion matrix:
# 0 1 class.error
# 0 57009 60 0.001051359
# 1 1700 233 0.879461976
####**********************************************************************###
### Author's conclusion: ntree = 10 is the optimal ntree value ************###
###***********************************************************************###
##### Locate the response column (prints its index) ###############
grep("scam_final", colnames(train_new))
# Tune mtry with tuneRF(): starting from the default, mtry is inflated and
# deflated by stepFactor (1.5) at each step, and the search in a direction
# continues only while the relative OOB-error improvement is at least
# `improve` (0.01).
# The predictors are selected by name (everything except scam_final) instead
# of dropping a hard-coded column position, so a change in column order cannot
# leak the response into the predictor set.
predictor_cols <- setdiff(colnames(train_new), "scam_final")
bestmtry <- tuneRF(
  train_new[, predictor_cols, drop = FALSE],
  train_new$scam_final,
  ntreeTry = 10,
  stepFactor = 1.5,
  improve = 0.01,
  trace = TRUE,
  plot = TRUE
)
# Recorded trace:
# mtry = 11 OOB error = 3.83%
# Searching left ...
# 0.02349072 0.01
# 0.0290644 0.01
# 0.03096187 0.01
# 0.02621193 0.01
# 0.03306647 0.01
# Searching right ...
# -0.1744363 0.01
# mtry value(s) with the lowest OOB error (column 1 = mtry, column 2 = OOBError).
best.m <- bestmtry[bestmtry[, 2] == min(bestmtry[, 2]), 1]
print(bestmtry)
print(best.m)
# Recorded output:
# mtry OOBError
# 2.OOB 2 0.03316327
# 3.OOB 3 0.03429736
# 4.OOB 4 0.03522056
# 6.OOB 6 0.03634589
# 8.OOB 8 0.03743389
# 11.OOB 11 0.03833439
# 16.OOB 16 0.03894814
# > print(best.m)
# [1] 2
# ---- mtry / ntree / nodesize search ----
# Model names appear to follow modelRF_new_<ntree>_<mtry>.
# Every fit is preceded by set.seed(123); the commented blocks after each fit
# are the outputs recorded by the author.
# NOTE(review): several assignments below had lost their right-hand side in
# the source. Where a printed call echo survives it was used; otherwise the
# arguments are inferred from the model name and the surrounding runs. Each
# restored call is marked "restored" -- confirm before relying on it.

set.seed(123)
modelRF_new_10_16 <- randomForest(
  scam_final ~ ., data = train_new,
  mtry = 16, ntree = 10, nodesize = 2, maxnodes = 10000
)
# randomForest(formula = scam_final ~ ., data = train_new, mtry = 16, ntree = 10, nodesize = 2,
# maxnodes = 10000)
# Confusion matrix:
# 0 1 class.error
# 0 55893 627 0.01109342
# 1 1563 345 0.81918239

# restored (from the call echo: mtry = 16, ntree = 30, nodesize = 2, maxnodes = 10000)
set.seed(123)
modelRF_new_30_16 <- randomForest(
  scam_final ~ ., data = train_new,
  mtry = 16, ntree = 30, nodesize = 2, maxnodes = 10000
)
# Confusion matrix:
# 0 1 class.error
# 0 56899 170 0.00297885
# 1 1657 276 0.85721676

print(bestmtry)
print(best.m)
###### ************************* Final Model ************************* #####
# restored (same recorded confusion matrix as the mtry = 16, ntree = 10 run above)
set.seed(123)
modelRF_new_10_16 <- randomForest(
  scam_final ~ ., data = train_new,
  mtry = 16, ntree = 10, nodesize = 2, maxnodes = 10000
)
# Confusion matrix:
# 0 1 class.error
# 0 55893 627 0.01109342
# 1 1563 345 0.81918239

# restored (arguments inferred from the model name)
set.seed(123)
modelRF_new_10_14 <- randomForest(
  scam_final ~ ., data = train_new,
  mtry = 14, ntree = 10, nodesize = 2, maxnodes = 10000
)
# Confusion matrix:
# 0 1 class.error
# 0 55901 587 0.01039159
# 1 1572 339 0.82260597

# restored (arguments inferred from the model name)
set.seed(123)
modelRF_new_10_18 <- randomForest(
  scam_final ~ ., data = train_new,
  mtry = 18, ntree = 10, nodesize = 2, maxnodes = 10000
)
# Confusion matrix:
# 0 1 class.error
# 0 55847 648 0.01147004
# 1 1543 371 0.80616510

# restored (arguments inferred from the model name)
set.seed(123)
modelRF_new_100_16 <- randomForest(
  scam_final ~ ., data = train_new,
  mtry = 16, ntree = 100, nodesize = 2, maxnodes = 10000
)
# Confusion matrix:
# 0 1 class.error
# 0 56984 85 0.001489425
# 1 1680 253 0.869115365

# restored (arguments inferred from the model name)
set.seed(123)
modelRF_new_50_16 <- randomForest(
  scam_final ~ ., data = train_new,
  mtry = 16, ntree = 50, nodesize = 2, maxnodes = 10000
)
# Confusion matrix:
# 0 1 class.error
# 0 56940 129 0.002260422
# 1 1653 280 0.855147439

# restored (arguments inferred from the model name)
# NOTE(review): the recorded confusion matrix is identical to the mtry = 18,
# ntree = 20 run that follows, so the original mtry here is uncertain.
set.seed(123)
modelRF_new_20_16 <- randomForest(
  scam_final ~ ., data = train_new,
  mtry = 16, ntree = 20, nodesize = 2, maxnodes = 10000
)
# Confusion matrix:
# 0 1 class.error
# 0 56820 243 0.004258451
# 1 1592 341 0.823590274

# NOTE(review): cv.fold is an rfcv() argument, not a documented randomForest()
# one; presumably it is ignored in the calls below -- confirm.
# NOTE(review): this run is named *_20_16 but uses mtry = 18 (kept as written).
set.seed(123)
modelRF_new_20_16 <- randomForest(
  scam_final ~ ., data = train_new,
  cv.fold = 10, mtry = 18, ntree = 20, nodesize = 2, maxnodes = 10000
)
# randomForest(formula = scam_final ~ ., data = train_new, cv.fold = 10, mtry = 18, ntree = 20,
# nodesize = 2, maxnodes = 10000)
# Confusion matrix:
# 0 1 class.error
# 0 56820 243 0.004258451
# 1 1592 341 0.823590274

# restored (arguments inferred from the model name and the preceding call)
set.seed(123)
modelRF_new_15_18 <- randomForest(
  scam_final ~ ., data = train_new,
  cv.fold = 10, mtry = 18, ntree = 15, nodesize = 2, maxnodes = 10000
)
# Confusion matrix:
# 0 1 class.error
# 0 56649 367 0.00643679
# 1 1566 361 0.81266217

# restored (arguments inferred from the model name and the preceding call)
set.seed(123)
modelRF_new_10_20 <- randomForest(
  scam_final ~ ., data = train_new,
  cv.fold = 10, mtry = 20, ntree = 10, nodesize = 2, maxnodes = 10000
)
# Confusion matrix:
# 0 1 class.error
# 0 55841 645 0.01141876
# 1 1555 360 0.81201044

# restored (arguments inferred from the model name and the preceding call)
set.seed(123)
modelRF_new_10_18 <- randomForest(
  scam_final ~ ., data = train_new,
  cv.fold = 10, mtry = 18, ntree = 10, nodesize = 2, maxnodes = 10000
)
# Confusion matrix:
# 0 1 class.error
# 0 55847 648 0.01147004
# 1 1543 371 0.80616510

# nodesize = 10, no maxnodes limit.
set.seed(123)
modelRF_new_10_18 <- randomForest(
  scam_final ~ ., data = train_new,
  cv.fold = 10, mtry = 18, ntree = 10, nodesize = 10
)
# Confusion matrix:
# 0 1 class.error
# 0 56020 446 0.007898558
# 1 1587 324 0.830455259

# nodesize = 5, no maxnodes limit. This is the last assignment of
# modelRF_new_10_18, i.e. the model used by the scoring sections that follow.
set.seed(123)
modelRF_new_10_18 <- randomForest(
  scam_final ~ ., data = train_new,
  cv.fold = 10, mtry = 18, ntree = 10, nodesize = 5
)
# Confusion matrix:
# 0 1 class.error
# 0 55943 546 0.009665599
# 1 1563 348 0.817896389
###############################################################
#####*************** Performance TEST ************************###
# Score the hold-out test set with modelRF_new_10_18.
test_new <- read.csv(
  "/Users/name/R_Directory/Scam_Model/RFM_TEST_3.csv",
  header = TRUE
)
summary(test_new$scam_final)
table(test_new$scam_final)
# Convert the same categorical columns as in training to factors.
cols <- c(
  "scam_final", "device_js_enabled", "device_cookie_enabled",
  "opacket_dfp_device_new", "ip_mismatch"
)
test_new[cols] <- lapply(test_new[cols], as.factor)
str(test_new)
#### Prediction with class
pred_class_modelTest <- predict(modelRF_new_10_18, test_new, type = "class")
t_test_new <- table(
  predictions = pred_class_modelTest,
  actual = test_new$scam_final
)
##### Accuracy: share of observations on the diagonal of the confusion table
sum(diag(t_test_new)) / sum(t_test_new)
#### Plotting ROC curve and calculating AUC metrics
library(pROC)
pred_prob_modelTest <- predict(modelRF_new_10_18, test_new, type = "prob")
# Build the ROC object once (second probability column as the score) and
# reuse it for both the AUC and the plot.
roc_test <- roc(test_new$scam_final, pred_prob_modelTest[, 2])
auc_test <- auc(roc_test)
plot(roc_test)
# Prints the position of the response column.
grep("scam_final", colnames(test_new))
# Export the actual response next to the predicted class probabilities.
# The positional selection is kept so the CSV header stays unchanged;
# assumes scam_final is column 119, as printed by grep() above -- TODO confirm.
OutPut_test_new <- data.frame(test_new[, c(119)], pred_prob_modelTest)
write.csv(
  OutPut_test_new,
  "/Users/name/R_Directory/Scam_Model/OutPut_test_new.csv"
)
#####*************** Performance TEST END ************************###
###############################################################
#####*************** Performance OOT ************************###
# Score the OOT data set with modelRF_new_10_18.
oot_new <- read.csv(
  "/Users/name/R_Directory/Scam_Model/RFM_OOT_3.csv",
  header = TRUE
)
summary(oot_new$scam_final)
table(oot_new$scam_final)
# Convert the categorical columns listed in `cols` (defined earlier in the
# script) to factors. trimws() strips stray whitespace/newlines from the names
# so each entry matches a real column name.
factor_cols <- trimws(cols)
oot_new[factor_cols] <- lapply(oot_new[factor_cols], as.factor)
str(oot_new)
table(oot_new$scam_final)
#### Prediction with class
pred_class_modeloot <- predict(modelRF_new_10_18, oot_new, type = "class")
t_oot_new <- table(
  predictions = pred_class_modeloot,
  actual = oot_new$scam_final
)
##### Accuracy: share of observations on the diagonal of the confusion table
sum(diag(t_oot_new)) / sum(t_oot_new)
#### Plotting ROC curve and calculating AUC metrics
library(pROC)
pred_prob_modeloot <- predict(modelRF_new_10_18, oot_new, type = "prob")
# Build the ROC object once (second probability column as the score) and
# reuse it for both the AUC and the plot.
roc_oot <- roc(oot_new$scam_final, pred_prob_modeloot[, 2])
auc_oot_new <- auc(roc_oot)
plot(roc_oot)
# Prints the position of the response column.
grep("scam_final", colnames(oot_new))
# Export the actual response next to the predicted class probabilities.
# The positional selection is kept so the CSV header stays unchanged;
# assumes scam_final is column 119, as printed by grep() above -- TODO confirm.
OutPut_oot_new <- data.frame(oot_new[, c(119)], pred_prob_modeloot)
write.csv(
  OutPut_oot_new,
  "/Users/name/R_Directory/Scam_Model/OutPut_oot_new.csv"
)
# Extract the first tree of the forest with variable names as labels,
# print it and export it.
tree <- getTree(modelRF_new_10_18, k = 1, labelVar = TRUE)
tree
write.csv(tree, "tree1.csv")
###############################################################
library(h2o)

# ---- Residual page text from the document-sharing site (not R code) ----
# RandomForest H2O
#
# Uploaded by
#
# Document information
#
# Original title
#
# Copyright
#
# Available formats
#
# Share this document
#
# Share or embed document
#
# Sharing options
#
# Do you find this document useful?
#
# Is this content inappropriate?
#
# Copyright:
#
# Available formats
#
# RandomForest H2O
#
# Uploaded by
#
# Copyright:
#
# Available formats

# NOTE(review): the lines from here on are duplicated preview fragments of the
# script. The three statements that were live code are commented out: the
# read.csv() would overwrite the factor-converted train_new, and the tuneRF()
# call is truncated (never closed), which breaks parsing of the file.
#####Check the directory

###set the working Directory

###Import Train CSV file

# train_new <- read.csv("C:/Users/name/R_Directory/Scam_Model/RFM_TRAIN_3.csv", header=TRUE)

####Check format of the dataframe

#####Check diamension of the dataframe

####Get top 3 raws of the dataset

#####Check the number of unique values

#####Get column index number of Response variable###############

# randomForest(formula = scam_final ~ ., data = train_new, ntree = 10)

# Type of random forest: classification

# No. of variables tried at each split: 11

# OOB estimate of error rate: 3.78%

# 0 55885 645 0.01140987

# 1 1562 352 0.81609195

# randomForest(formula = scam_final ~ ., data = train_new, ntree = 10, nodesize = 2, maxnodes =

# Type of random forest: classification

# No. of variables tried at each split: 11

# 0 55920 612 0.01082573

# 1 1562 355 0.81481481

# randomForest(formula = scam_final ~ ., data = train_new, ntree = 50)

# Type of random forest: classification

# No. of variables tried at each split: 11

# OOB estimate of error rate: 3.03%

# 0 56960 109 0.001909969

# 1 1680 253 0.869115365

# randomForest(formula = scam_final ~ ., data = train_new, ntree = 50, nodesize = 2, maxnodes =

# Type of random forest: classification

# No. of variables tried at each split: 11

# OOB estimate of error rate: 3.03%

# 1 1696 237 0.877392654

# randomForest(formula = scam_final ~ ., data = train_new, ntree = 25, nodesize = 2, maxnodes =

# Type of random forest: classification

# No. of variables tried at each split: 11

# OOB estimate of error rate: 3.07%

# 0 56908 161 0.002821146

# 1 1649 284 0.853078117

# randomForest(formula = scam_final ~ ., data = train_new, ntree = 100, nodesize = 2, maxnodes =

# Type of random forest: classification

# Number of trees: 100

# No. of variables tried at each split: 11

# 1 1700 233 0.879461976

###Ntree=10 is the optimal ntree value************************************###

#####Get column index number of Response variable###############

####it will go ahead else it will stop

# bestmtry<-tuneRF(train_new[,-c(119)],train_new$scam_final,ntreeTry = 10, stepFactor = 1.5,improve =

# mtry = 11 OOB error = 3.83%

# Searching left ...

# mtry = 8 OOB error = 3.74%

# mtry = 6 OOB error = 3.63%

# mtry = 4 OOB error = 3.52%

# mtry = 3 OOB error = 3.43%

# mtry = 2 OOB error = 3.32%

# Searching right ...

# mtry = 16 OOB error = 3.89%

# best.m <- bestmtry[bestmtry[, 2] == min(bestmtry[, 2]), 1]

# Type of random forest: classification

# No. of variables tried at each split: 16

# OOB estimate of error rate: 3.75%

# 0 55893 627 0.01109342

# 1 1563 345 0.81918239

# randomForest(formula = scam_final ~ ., data = train_new, mtry = 16, ntree = 30, nodesize = 2,

# Type of random forest: classification