# Documente Academic
# Documente Profesional
# Documente Cultură
# Show the current working directory, then switch to the project folder so
# that the relative-path outputs below (e.g. "cor_matrix.csv") land there.
# NOTE(review): the hard-coded setwd() ties the script to one machine; it is
# kept because later relative paths depend on it.
getwd()
setwd("C:/Users/name/R_Directory/Scam_Model")
# set.seed(123)

# NOTE(review): `train_new` is expected to exist already; the step that loads
# it is not visible in this part of the script.
# Structure, dimensions and first rows of the training data.
str(train_new)
dim(train_new)
head(train_new)

#### Check summary of the scam_final column
summary(train_new$scam_final)
table(train_new$scam_final)

# Pairwise correlations between all columns (cor() requires every column to
# be numeric), saved to CSV for offline inspection.
cor_matrix <- cor(train_new)
write.csv(cor_matrix, "cor_matrix.csv")

# Number of distinct values per column, used to spot categorical variables.
# vapply() works column by column, so the data frame is not coerced to a
# matrix first (as apply() would do) and the result is always a named
# integer vector.
Un_val <- vapply(train_new, function(x) length(unique(x)), integer(1))
##### Several columns hold categorical codes and need to be converted to
##### factors. The original note lists six of them:
#### device_js_enabled, device_cookie_enabled, opacket_dfp_device_new,
#### isp_match, ip_mismatch, scam_final
# NOTE(review): isp_match is named in the note above but is not part of
# `cols`, so it is left unconverted -- confirm whether that is intended.
# Each name must match a column of train_new exactly (no embedded line break
# or trailing whitespace), otherwise the column lookup fails.
cols <- c(
  "scam_final",
  "device_js_enabled",
  "device_cookie_enabled",
  "opacket_dfp_device_new",
  "ip_mismatch"
)

# Convert all listed columns to factors in a single assignment.
train_new[cols] <- lapply(train_new[cols], as.factor)

# Re-inspect the data after the conversion.
str(train_new)
dim(train_new)
################## Build random forest models #######################
library(randomForest)

# Every fit below is preceded by set.seed(123) so the runs are comparable.
# The comment lines after each fit are console output recorded by the
# original author (partly truncated).

# Baseline: 10 trees, package defaults otherwise.
# NOTE(review): the object name ends in _5 although ntree is 10.
set.seed(123)
modelRF_new_5 <- randomForest(
  scam_final ~ .,
  data = train_new,
  ntree = 10
)
# Number of trees: 10
# Confusion matrix:
# 0 1 class.error

###### Author's note: class 1 error increases with an increase in nodesize
###### and a decrease in maxnodes **********

# 10 trees with small terminal nodes and a high cap on node count.
set.seed(123)
modelRF_new_10_1 <- randomForest(
  scam_final ~ .,
  data = train_new,
  ntree = 10,
  nodesize = 2,
  maxnodes = 10000
)
# Number of trees: 10
# Confusion matrix:
# 0 1 class.error

# 50 trees, package defaults otherwise.
set.seed(123)
modelRF_new_50 <- randomForest(
  scam_final ~ .,
  data = train_new,
  ntree = 50
)
# Number of trees: 50
# Confusion matrix:
# 0 1 class.error

# 50 trees, nodesize = 2, maxnodes = 10000.
set.seed(123)
modelRF_new_50_1 <- randomForest(
  scam_final ~ .,
  data = train_new,
  ntree = 50,
  nodesize = 2,
  maxnodes = 10000
)
# Number of trees: 50
# 0 1 class.error
# 0 56979 90 0.001577038

# 25 trees, nodesize = 1, maxnodes = 10000.
set.seed(123)
modelRF_new_25_1 <- randomForest(
  scam_final ~ .,
  data = train_new,
  ntree = 25,
  nodesize = 1,
  maxnodes = 10000
)
# Number of trees: 25
# Confusion matrix:
# 0 1 class.error

# 100 trees, nodesize = 2, maxnodes = 10000.
set.seed(123)
modelRF_new_100_1 <- randomForest(
  scam_final ~ .,
  data = train_new,
  ntree = 100,
  nodesize = 2,
  maxnodes = 10000
)
# Confusion matrix:
# 0 1 class.error
# 0 57009 60 0.001051359
####**********************************************************************###
###***********************************************************************###
# Print the column position(s) of train_new whose name matches "scam_final".
grep("scam_final", colnames(train_new))
#### Find the best mtry with tuneRF. Per the original note, mtry is stepped
#### by a factor of 1.5 on each iteration and each step is judged against an
#### improvement threshold of 0.01 (the original comment is cut off here).
# NOTE(review): the tuneRF call that creates `bestmtry` and `best.m` is not
# visible in this part of the script -- confirm it runs before the prints
# below.
# Recorded console output (presumably improvement vs. threshold per step):
# 0.02349072 0.01
# 0.03096187 0.01
# 0.02621193 0.01
# 0.03306647 0.01
# -0.1744363 0.01
print(bestmtry)
print(best.m)
# Recorded console output of the two prints above:
# mtry OOBError
# 2.OOB 2 0.03316327
# 3.OOB 3 0.03429736
# 4.OOB 4 0.03522056
# 6.OOB 6 0.03634589
# 8.OOB 8 0.03743389
# 11.OOB 11 0.03833439
# 16.OOB 16 0.03894814
# > print(best.m)
# [1] 2
# Trials with mtry fixed at 16. Each fit is seeded with 123 so the runs are
# comparable. Comment lines after a fit are console output recorded by the
# original author (partly truncated).

# 10 trees, mtry = 16.
set.seed(123)
modelRF_new_10_16 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 16,
  ntree = 10,
  nodesize = 2,
  maxnodes = 10000
)
# randomForest(formula = scam_final ~ ., data = train_new, mtry = 16, ntree = 10, nodesize = 2,
#              maxnodes = 10000)
# Number of trees: 10
# Confusion matrix:
# 0 1 class.error

# 30 trees, mtry = 16.
set.seed(123)
modelRF_new_30_16 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 16,
  ntree = 30,
  nodesize = 2,
  maxnodes = 10000
)
# Number of trees: 30
# Confusion matrix:
# 0 1 class.error

# NOTE(review): `best.m` is not defined in the visible part of the script.
print(best.m)
###### ************************* Final Model ************************* #####
# Candidate fits around mtry = 16 with nodesize = 2 and maxnodes = 10000.
# Each fit is seeded with 123 so the runs are comparable. Comment lines after
# a fit are console output recorded by the original author (partly
# truncated).

# 10 trees, mtry = 16.
# NOTE(review): this name is also assigned earlier in the script with the
# same arguments; this fit overwrites it.
set.seed(123)
modelRF_new_10_16 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 16,
  ntree = 10,
  nodesize = 2,
  maxnodes = 10000
)
# Number of trees: 10
# Confusion matrix:
# 0 1 class.error

# 10 trees, mtry = 14.
set.seed(123)
modelRF_new_10_14 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 14,
  ntree = 10,
  nodesize = 2,
  maxnodes = 10000
)
# Number of trees: 10
# Confusion matrix:
# 0 1 class.error

# 10 trees, mtry = 18.
set.seed(123)
modelRF_new_10_18 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 18,
  ntree = 10,
  nodesize = 2,
  maxnodes = 10000
)
# Number of trees: 10
# Confusion matrix:
# 0 1 class.error

# 100 trees, mtry = 16.
set.seed(123)
modelRF_new_100_16 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 16,
  ntree = 100,
  nodesize = 2,
  maxnodes = 10000
)
# randomForest(formula = scam_final ~ ., data = train_new, mtry = 16, ntree = 100, nodesize = 2,
#              maxnodes = 10000)
# Confusion matrix:
# 0 1 class.error
# 0 56984 85 0.001489425

# 50 trees, mtry = 16.
set.seed(123)
modelRF_new_50_16 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 16,
  ntree = 50,
  nodesize = 2,
  maxnodes = 10000
)
# Number of trees: 50
# Confusion matrix:
# 0 1 class.error

# 20 trees.
# NOTE(review): the object name ends in _16 but the fit uses mtry = 18 --
# confirm which one was intended.
set.seed(123)
modelRF_new_20_16 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 18,
  ntree = 20,
  nodesize = 2,
  maxnodes = 10000
)
# Number of trees: 20
# Confusion matrix:
# 0 1 class.error
# Further fits varying ntree, mtry and nodesize. Each fit is seeded with 123
# so the runs are comparable.
#
# The original calls passed cv.fold = 10. That is an argument of rfcv(), not
# of randomForest(), where it is silently ignored; it is therefore dropped
# here. Use rfcv() if cross-validation is actually wanted.
#
# Comment lines after a fit are console output recorded by the original
# author (partly truncated); the recorded calls still show cv.fold.

# 20 trees, mtry = 18.
# NOTE(review): the object name ends in _16 but the fit uses mtry = 18.
set.seed(123)
modelRF_new_20_16 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 18,
  ntree = 20,
  nodesize = 2,
  maxnodes = 10000
)
# randomForest(formula = scam_final ~ ., data = train_new, cv.fold = 10, mtry = 18, ntree = 20,
#              nodesize = 2, maxnodes = 10000)
# Number of trees: 20
# Confusion matrix:
# 0 1 class.error

# 15 trees, mtry = 18. Seeded like every other fit in this script.
set.seed(123)
modelRF_new_15_18 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 18,
  ntree = 15,
  nodesize = 2,
  maxnodes = 10000
)
# randomForest(formula = scam_final ~ ., data = train_new, cv.fold = 10, mtry = 18, ntree = 15,
#              nodesize = 2, maxnodes = 10000)
# Number of trees: 15
# Confusion matrix:
# 0 1 class.error

# 10 trees, mtry = 20.
set.seed(123)
modelRF_new_10_20 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 20,
  ntree = 10,
  nodesize = 2,
  maxnodes = 10000
)
# randomForest(formula = scam_final ~ ., data = train_new, cv.fold = 10, mtry = 20, ntree = 10,
#              nodesize = 2, maxnodes = 10000)
# Number of trees: 10
# Confusion matrix:
# 0 1 class.error

# 10 trees, mtry = 18, nodesize = 2, maxnodes = 10000.
set.seed(123)
modelRF_new_10_18 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 18,
  ntree = 10,
  nodesize = 2,
  maxnodes = 10000
)
# randomForest(formula = scam_final ~ ., data = train_new, cv.fold = 10, mtry = 18, ntree = 10,
#              nodesize = 2, maxnodes = 10000)
# Number of trees: 10
# Confusion matrix:
# 0 1 class.error

# Same name, refit with nodesize = 10 and no maxnodes cap.
set.seed(123)
modelRF_new_10_18 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 18,
  ntree = 10,
  nodesize = 10
)
# randomForest(formula = scam_final ~ ., data = train_new, cv.fold = 10, mtry = 18, ntree = 10,
#              nodesize = 10)
# Number of trees: 10
# 0 1 class.error

# Same name, refit with nodesize = 5. This is the last assignment to
# modelRF_new_10_18, so it is the model used by the predict() calls in the
# performance sections that follow.
set.seed(123)
modelRF_new_10_18 <- randomForest(
  scam_final ~ .,
  data = train_new,
  mtry = 18,
  ntree = 10,
  nodesize = 5
)
# randomForest(formula = scam_final ~ ., data = train_new, cv.fold = 10, mtry = 18, ntree = 10,
#              nodesize = 5)
# Number of trees: 10
# Confusion matrix:
# 0 1 class.error
###############################################################
##### *************** Performance: TEST ******************** ###
# Load the test sample and show the distribution of scam_final.
test_new <- read.csv(
  "/Users/name/R_Directory/Scam_Model/RFM_TEST_3.csv",
  header = TRUE
)
summary(test_new$scam_final)
table(test_new$scam_final)

# Columns converted to factors before scoring. Each name must match a column
# exactly (no embedded line break or trailing whitespace).
cols <- c(
  "scam_final",
  "device_js_enabled",
  "device_cookie_enabled",
  "opacket_dfp_device_new",
  "ip_mismatch"
)
test_new[cols] <- lapply(test_new[cols], as.factor)
str(test_new)

# Predicted classes and the confusion table (rows = predicted, cols = actual).
pred_class_modelTest <- predict(modelRF_new_10_18, test_new, type = "class")
t_test_new <- table(
  predictions = pred_class_modelTest,
  actual = test_new$scam_final
)

##### Accuracy metrics
# Share of observations on the diagonal of the confusion table.
sum(diag(t_test_new)) / sum(t_test_new)

# ROC / AUC based on the second column of the predicted probability matrix.
library(pROC)
pred_prob_modelTest <- predict(modelRF_new_10_18, test_new, type = "prob")
auc_test <- auc(test_new$scam_final, pred_prob_modelTest[, 2])
plot(roc(test_new$scam_final, pred_prob_modelTest[, 2]))

# Column position of scam_final (informational only).
grep("scam_final", colnames(test_new))

# Actual label next to the predicted probabilities. The label is selected by
# name rather than by a hard-coded column position, and the probabilities
# computed above are reused instead of predicting a second time.
# NOTE(review): the first CSV column is now headed `scam_final`; it was
# previously an auto-generated name derived from the indexing expression.
OutPut_test_new <- data.frame(
  scam_final = test_new[["scam_final"]],
  pred_prob_modelTest
)
write.csv(
  OutPut_test_new,
  "/Users/name/R_Directory/Scam_Model/OutPut_test_new.csv"
)
###############################################################
##### *************** Performance: OOT ********************* ###
# Load the OOT sample and show the distribution of scam_final.
oot_new <- read.csv(
  "/Users/name/R_Directory/Scam_Model/RFM_OOT_3.csv",
  header = TRUE
)
summary(oot_new$scam_final)
table(oot_new$scam_final)

# Columns converted to factors before scoring (same list as used for the
# other samples). Each name must match a column exactly.
cols <- c(
  "scam_final",
  "device_js_enabled",
  "device_cookie_enabled",
  "opacket_dfp_device_new",
  "ip_mismatch"
)
oot_new[cols] <- lapply(oot_new[cols], as.factor)
str(oot_new)
table(oot_new$scam_final)

# Predicted classes and the confusion table (rows = predicted, cols = actual).
pred_class_modeloot <- predict(modelRF_new_10_18, oot_new, type = "class")
t_oot_new <- table(
  predictions = pred_class_modeloot,
  actual = oot_new$scam_final
)

##### Accuracy metrics
# Share of observations on the diagonal of the confusion table.
sum(diag(t_oot_new)) / sum(t_oot_new)

# ROC / AUC based on the second column of the predicted probability matrix.
library(pROC)
pred_prob_modeloot <- predict(modelRF_new_10_18, oot_new, type = "prob")
auc_oot_new <- auc(oot_new$scam_final, pred_prob_modeloot[, 2])
plot(roc(oot_new$scam_final, pred_prob_modeloot[, 2]))

# Column position of scam_final (informational only).
grep("scam_final", colnames(oot_new))

# Actual label next to the predicted probabilities. The label is selected by
# name rather than by a hard-coded column position, and the probabilities
# computed above are reused instead of predicting a second time.
# NOTE(review): the first CSV column is now headed `scam_final`; it was
# previously an auto-generated name derived from the indexing expression.
OutPut_oot_new <- data.frame(
  scam_final = oot_new[["scam_final"]],
  pred_prob_modeloot
)
write.csv(
  OutPut_oot_new,
  "/Users/name/R_Directory/Scam_Model/OutPut_oot_new.csv"
)
# Pull the first tree (k = 1) out of the forest with labelled split variables,
# display it, and save it as "tree1.csv" relative to the working directory.
tree <- getTree(modelRF_new_10_18, k = 1, labelVar = TRUE)
tree
write.csv(tree, file = "tree1.csv")
###############################################################
library(h2o)