
library(class)

library(missForest)

library(naivebayes)
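# class      : knn() for k-nearest-neighbour classification
# missForest : missForest() for random-forest based imputation of missing values
# naivebayes : naive_bayes() for the Naive Bayes classifier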

# GETTING DATA AND VIEWING

train=read.csv(file.choose(),sep=',',header=T)

str(train)

summary(train)

colSums(is.na(train))                  # NA count per column

nrow(train[!complete.cases(train),])   # number of rows with at least one missing value

# DATA CLEANING AND PREPARATION

# CONVERTING COLUMNS TO FACTOR

count=0

# A numeric column whose quartiles collapse (e.g. min == Q1, or Q1 == median)
# has only a few distinct values, so it is treated as categorical
for(i in 1:ncol(train)){
  if(class(train[,i])=='numeric' || class(train[,i])=='integer'){
    Q=quantile(train[,i],probs=c(0,0.25,0.5,0.75,1),na.rm=T)
    if(Q[1]==Q[2] || Q[2]==Q[3] || Q[3]==Q[4] || Q[4]==Q[5]){
      train[,i]=factor(train[,i])
      print(paste(names(train[i]),": CHANGED TO FACTOR"))
      count=count+1
    }else{
      print(paste(names(train[i]),": IS REAL NUMBER"))
    }
  }else if(class(train[,i])=='factor'){
    print(paste(names(train[i]),": ALREADY A FACTOR"))
  }else{
    print(paste(names(train[i]),": NOT NUMERIC"))
  }
}

print(paste("**** NO. OF ATTRIBUTES CONVERTED TO FACTOR:",count,"****"))

summary(train)

train=train[c(-1,-4,-9,-11)]
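# Columns 1, 4, 9 and 11 are dropped; assuming the standard Kaggle Titanic layout,
# these are PassengerId, Name, Ticket and Cabin (identifiers/free text not used for modelling)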

train$Survived=factor(train$Survived,labels=c('NO','YES'))

train$Sex=factor(train$Sex,labels=c(0,1),levels = c('female','male'))

table(train$Sex)

prop.table(table(train$Sex))*100

table(train$Survived)

prop.table(table(train$Survived))*100

#removing outliers

# Replace boxplot outliers in the numeric columns with NA (imputed in the next step)
remove_outliers<-function(x){
  for (i in 1:ncol(x)){
    if(class(x[,i])=='numeric' || class(x[,i])=='integer'){
      OUTLIERS=boxplot(x[,i],plot=FALSE)$out
      if (length(OUTLIERS)==0){
        print('NO OUTLIERS')
      }else{
        print('OUTLIERS')
        x[which(x[,i] %in% OUTLIERS),i]=NA
      }
    }else{
      print("NOT NUMERIC")
    }
  }
  return(x)
}

train<-remove_outliers(train)

summary(train)

# REMOVING NA

# Impute remaining missing values: if the data are already complete they are
# returned unchanged, otherwise missForest() fills the NAs (random-forest imputation)
impute_missing<-function(y){
  if (nrow(y[!complete.cases(y),])==0){
    DATANONA1=y
  }else{
    DATAMISSFOREST <- missForest(y)
    DATANONA1=as.data.frame(DATAMISSFOREST$ximp)   # $ximp holds the imputed data frame
  }
  return(DATANONA1)
}

train<-impute_missing(train)

summary(train)
tr=train

# NORMALIZE DATASET

Normalize <- function(x) {
  return ((x - min(x)) / (max(x) - min(x)))
}

# Min-max scale the continuous columns (4 and 7; assuming the standard Kaggle
# Titanic layout after the drops above, these are Age and Fare)
train=as.data.frame(lapply(tr[c(4,7)],Normalize))

ncol(train)

summary(train)

train=cbind(tr[c(1:3,5,6)],train)

summary(train)

View(train)

#DATA SUBSETTING AND PREPARATION

set.seed(1234)

ind <- sample(2, nrow(train), replace = T, prob = c(0.7, 0.3))
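# ~70% of the rows go to TRAINING (ind == 1) and ~30% to TESTING (ind == 2)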

TRAINING= train[ind==1,]

TESTING= train[ind==2,]

summary(TRAINING)

TRAINING_LABEL=TRAINING[,1]

TESTING_LABEL=TESTING[,1]

TRAINING_KNN=TRAINING[,-1]

TESTING_KNN=TESTING[,-1]

summary(TRAINING_KNN)

ncol(TESTING_KNN)

summary(TRAINING_LABEL)
#CREATING KNN MODEL

NEARESTNEIGHBOUR=round(sqrt(nrow(train)))
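# Common rule of thumb: try every K from 1 up to the square root of the number of rows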

print(NEARESTNEIGHBOUR)

df=NULL

for (i in 1:NEARESTNEIGHBOUR){
  set.seed(1234)
  PREDICTION=knn(train=TRAINING_KNN,test=TESTING_KNN,cl=TRAINING_LABEL,k=i)
  VALIDATION=table(TESTING_LABEL,PREDICTION)
  ACCURACY=sum(diag(VALIDATION))/sum(VALIDATION)*100
  print(paste("When Nearest neighbour =",i,"Then Accuracy =",ACCURACY))
  df=rbind(df,data.frame(K=i,Acc=ACCURACY))
}
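# Optional quick visual check: plot the accuracies collected in df
# to see how the choice of K affects performance
plot(df$K,df$Acc,type='b',xlab='K (number of neighbours)',ylab='Accuracy (%)')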

# SELECTING BEST VALUE OF K

MAXK=subset(df,Acc==max(Acc),select=K)

MAXK

if (length(MAXK$K)>1){
  # Several K values tie for the best accuracy: prefer an odd K
  # (avoids voting ties between the two classes), otherwise take the first
  K=MAXK$K[1]
  for (i in 1:length(MAXK$K)){
    if(MAXK$K[i] %% 2==1){
      K=MAXK$K[i]
      break
    }
  }
}else{
  K=MAXK$K
}

print(K)

# FIXED MODEL

set.seed(1234)

PREDICTION=knn(train=TRAINING_KNN,test=TESTING_KNN,cl=TRAINING_LABEL,k=K)

# CROSS VALIDATION

(VALIDATION=table(TESTING_LABEL,PREDICTION))

# CHECKING ACCURACY PERCENTAGE

(ACCURACY=sum(diag(VALIDATION))/sum(VALIDATION)*100)

#CREATING NAIVE BAYES MODEL

MODEL=naive_bayes(Survived~.,data=TRAINING)

plot(MODEL)

PREDICTIONPROB=predict(MODEL,TESTING,type="prob")

head(cbind(PREDICTIONPROB,TESTING))

PREDICTION=predict(MODEL,TESTING)

# CROSS VALIDATION

(VALIDATION=table(TEST=TESTING$Survived,PREDICTED=PREDICTION))
# CHECKING ACCURACY PERCENTAGE

(ACCURACY=sum(diag(VALIDATION))/sum(VALIDATION)*100)
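# A minimal optional variant: if some predictor level never occurs together with one
# of the classes in TRAINING, its estimated conditional probability is zero; Laplace
# smoothing guards against that (laplace = 1 is an arbitrary choice, and the
# *_LAPLACE names below are new)
MODEL_LAPLACE=naive_bayes(Survived~.,data=TRAINING,laplace=1)
PREDICTION_LAPLACE=predict(MODEL_LAPLACE,TESTING)
(VALIDATION_LAPLACE=table(TEST=TESTING$Survived,PREDICTED=PREDICTION_LAPLACE))
(sum(diag(VALIDATION_LAPLACE))/sum(VALIDATION_LAPLACE)*100)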

#CREATING GLM MODEL

mod2<-glm(Survived~Pclass+Sex+Age,data=TRAINING,family="binomial")

summary(mod2)

# Predicted survival probabilities on the response scale (between 0 and 1)
PREDICTION=predict(mod2,TESTING,type="response")

# Recode the actual outcome as 0/1 (NO -> 0, YES -> 1) so it can be compared
# with the predicted probabilities
ACTUAL=as.numeric(TESTING$Survived)-1

# Absolute prediction error per passenger: 0 = perfect, 1 = completely wrong
DIFFERENCE=abs(ACTUAL-PREDICTION)

head(cbind(ACTUAL,PREDICTED=round(PREDICTION,3),ERROR=round(DIFFERENCE,3)))

#CHECKING ACCURACY

# Mean absolute error expressed as an accuracy percentage
(FINAL_ACCURACY=(1-mean(DIFFERENCE))*100)
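# A sketch of an alternative check that matches the KNN and Naive Bayes sections:
# classify at a 0.5 probability threshold and read the accuracy off a confusion
# table (the *_GLM names below are new)
PREDICTION_GLM=factor(ifelse(PREDICTION>0.5,'YES','NO'),levels=c('NO','YES'))
(VALIDATION_GLM=table(TEST=TESTING$Survived,PREDICTED=PREDICTION_GLM))
(ACCURACY_GLM=sum(diag(VALIDATION_GLM))/sum(VALIDATION_GLM)*100)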
