# Documente Academice
# Documente Profesionale
# Documente Culturale
library(class)       # knn()
library(missForest)  # missForest() NA imputation
library(naivebayes)  # naive_bayes()
# Load the training data via an interactive file picker and inspect it.
train <- read.csv(file.choose(), sep = ",", header = TRUE)
str(train)
summary(train)
# Total count of missing cells (the original printed the entire
# logical NA matrix, which is unreadable for any real dataset).
sum(is.na(train))
# Number of rows containing at least one NA.
nrow(train[!complete.cases(train), ])
# Convert numeric/integer columns whose quantiles suggest a discrete,
# category-like distribution into factors; report on the rest.
# (The original text was split mid-identifier and its braces did not
# match; this is the reconstructed, parseable version.)
count <- 0
for (i in seq_len(ncol(train))) {
  if (is.numeric(train[, i])) {
    q <- quantile(train[, i], probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE)
    # If any two adjacent quartiles coincide, the column takes few
    # distinct values, so treat it as categorical.
    if (q[1] == q[2] || q[2] == q[3] || q[3] == q[4] || q[4] == q[5]) {
      train[, i] <- factor(train[, i])
      count <- count + 1
    }
  } else if (is.factor(train[, i])) {
    print("ALREADY IN FACTOR")
  } else {
    print("NOT NUMERIC")
  }
}
summary(train)
# Drop identifier/free-text columns (presumably PassengerId, Name,
# Ticket, Cabin — TODO confirm against the loaded column order) and
# recode the two key factors.
train <- train[, -c(1, 4, 9, 11)]
train$Survived <- factor(train$Survived, labels = c("NO", "YES"))
train$Sex <- factor(train$Sex, levels = c("female", "male"), labels = c(0, 1))
# Class balance of sex and survival, as counts and as percentages.
table(train$Sex)
100 * prop.table(table(train$Sex))
table(train$Survived)
100 * prop.table(table(train$Survived))
#removing outliers
# Outlier REPORT: for every numeric column, print whether boxplot-rule
# outliers exist. NOTE(review): despite the section title, the original
# collected the outliers and then immediately discarded them, so no rows
# are ever removed; that behavior is preserved and the input frame is
# returned unchanged. NOTE(review): the name `data` shadows base::data();
# kept only for compatibility with existing call sites.
#
# @param x data frame to scan.
# @return x, unchanged.
data <- function(x) {
  for (i in seq_len(ncol(x))) {
    if (is.numeric(x[, i])) {
      # plot = FALSE computes the boxplot statistics without drawing.
      outliers <- boxplot(x[, i], plot = FALSE)$out
      if (length(outliers) == 0) {
        print("NO OUTLIERS")
      } else {
        print("OUTLIERS")
      }
    }
  }
  x
}
# Run the outlier report over the training set.
# NOTE(review): data() as written returns its input unchanged, so this
# assignment is a no-op apart from the printed report — confirm intent.
train<-data(train)
summary(train)
# REMOVING NA
# Impute missing values. A frame that is already complete is returned
# as-is; otherwise missForest (loaded at the top of the file) imputes
# the NAs. The original referenced an undefined DATAMISSFOREST object
# and never actually invoked the imputation — fixed here.
#
# @param y data frame, possibly containing NAs.
# @return a complete data frame (y itself, or the missForest imputation).
data1 <- function(y) {
  if (nrow(y[!complete.cases(y), ]) == 0) {
    return(y)
  }
  imputed <- missForest(y)
  # missForest() returns a list whose $ximp element (its first element,
  # matching the original's DATAMISSFOREST[[1]]) is the imputed frame.
  as.data.frame(imputed$ximp)
}
# Replace the training set with a complete-cases version; data1() hands
# the frame back unchanged when it already has no NAs.
train<-data1(train)
summary(train)
tr <- train
# NORMALIZE DATASET
# Min-max rescale to [0, 1]. `Normalize` was called but never defined in
# the original script; the standard implementation is supplied here.
Normalize <- function(v) (v - min(v)) / (max(v) - min(v))
# Columns 4 and 7 are the continuous features (presumably Age and Fare —
# TODO confirm against the column order after the earlier drops).
train <- as.data.frame(lapply(tr[c(4, 7)], Normalize))
ncol(train)
summary(train)
# Reattach the untouched columns in front of the normalized ones.
train <- cbind(tr[c(1:3, 5, 6)], train)
summary(train)
View(train)
set.seed(1234)
# 70/30 train/test split. `ind` was used but never created in the
# original script; the conventional sample() split is supplied here.
ind <- sample(2, nrow(train), replace = TRUE, prob = c(0.7, 0.3))
TRAINING <- train[ind == 1, ]
TESTING <- train[ind == 2, ]
summary(TRAINING)
# Column 1 is the Survived label; everything else feeds knn().
TRAINING_LABEL <- TRAINING[, 1]
TESTING_LABEL <- TESTING[, 1]
TRAINING_KNN <- TRAINING[, -1]
TESTING_KNN <- TESTING[, -1]
summary(TRAINING_KNN)
ncol(TESTING_KNN)
summary(TRAINING_LABEL)
#CREATING KNN MODEL
# Rule of thumb: evaluate k from 1 up to sqrt(n).
NEARESTNEIGHBOUR <- round(sqrt(nrow(train)))
print(NEARESTNEIGHBOUR)
df <- NULL
for (i in seq_len(NEARESTNEIGHBOUR)) {
  # Reseed each iteration so knn()'s random tie-breaking is reproducible.
  set.seed(1234)
  PREDICTION <- knn(train = TRAINING_KNN, test = TESTING_KNN,
                    cl = TRAINING_LABEL, k = i)
  VALIDATION <- table(TESTING_LABEL, PREDICTION)
  ACCURACY <- sum(diag(VALIDATION)) / sum(VALIDATION) * 100
  df <- rbind(df, data.frame(K = i, Acc = ACCURACY))
}
# All k values achieving the maximum accuracy.
MAXK <- subset(df, Acc == max(Acc), select = K)
MAXK
# Prefer an odd k (avoids voting ties in a binary problem); otherwise
# take the first maximizer. The original assigned the whole subset()
# data frame to K in the single-winner case, which would break
# knn(k = K) below — extract the scalar instead.
if (length(MAXK$K) > 1) {
  K <- MAXK$K[1]
  for (i in seq_along(MAXK$K)) {
    if (MAXK$K[i] %% 2 == 1) {
      K <- MAXK$K[i]
      break
    }
  }
} else {
  K <- MAXK$K
}
print(K)
# FIXED MODEL
# Refit k-NN with the selected K and evaluate on the hold-out set.
set.seed(1234)
PREDICTION <- knn(train = TRAINING_KNN, test = TESTING_KNN,
                  cl = TRAINING_LABEL, k = K)
# CROSS VALIDATION
VALIDATION <- table(TESTING_LABEL, PREDICTION)
print(VALIDATION)
ACCURACY <- 100 * sum(diag(VALIDATION)) / sum(VALIDATION)
print(ACCURACY)
# Naive Bayes benchmark on the same train/test split.
MODEL <- naive_bayes(Survived ~ ., data = TRAINING)
plot(MODEL)
# Per-class membership probabilities, shown next to the test rows.
PREDICTIONPROB <- predict(MODEL, TESTING, type = "prob")
head(cbind(PREDICTIONPROB, TESTING))
# Hard class predictions.
PREDICTION <- predict(MODEL, TESTING)
# CROSS VALIDATION
VALIDATION <- table(TEST = TESTING$Survived, PREDICTED = PREDICTION)
print(VALIDATION)
# CHECKING ACCURACY PERCENTAGE
ACCURACY <- 100 * sum(diag(VALIDATION)) / sum(VALIDATION)
print(ACCURACY)
# Logistic-regression benchmark.
mod2 <- glm(Survived ~ Pclass + Sex + Age, data = TRAINING,
            family = "binomial")
summary(mod2)
# type = "response" yields survival probabilities in [0, 1]; the original
# used the default link scale (log-odds), which made the error arithmetic
# below meaningless.
PREDICTION <- predict(mod2, TESTING, type = "response")
# Survived is a factor ('NO'/'YES'); map it to 0/1 before any arithmetic —
# subtracting a number from a factor produces NAs.
ACTUAL <- as.numeric(TESTING$Survived) - 1
PREDICTED_CLASS <- ifelse(PREDICTION > 0.5, 1, 0)
DIFFERENCE <- abs(ACTUAL - PREDICTION)
head(cbind(ACTUAL = ACTUAL, PREDICTED = PREDICTION, ERROR = DIFFERENCE))
#CHECKING ACCURACY
# Classification accuracy at a 0.5 threshold. The original per-row
# "error percentage" divided by the 0/1 label and was undefined whenever
# the true label was 0.
(FINAL_ACCURACY <- 100 * mean(PREDICTED_CLASS == ACTUAL))