
# This script uses CART to do estimation

# Uses the Cereals dataset


# Load cereals.csv
dc<-read.csv("cereals.csv")

# Remove the fields "Cereals" and "shelf"


fld<-!(names(dc) %in% c("Cereals","shelf"))
dcm<-dc[,fld]

# Partition into training and validation datasets


set.seed(34)
r<-sample(seq_len(nrow(dc)),0.7*nrow(dc),replace=F)
dt<-dcm[r,]
dv<-dcm[-r,]

# Use rpart to create the CART


library(rpart)
m.cart<-rpart(rating~., data=dt, parms=list(split="information"),
              control=rpart.control(minsplit=2, minbucket=3))

# See the model output


summary(m.cart)

# Plot the tree


plot(m.cart,uniform=T,margin=.2)
text(m.cart,cex=.8, pretty=0)

# Get the ps plot for better visibility


post(m.cart,file="cereals.ps")

# Check where to prune

# Use the CP and cross-validation error plot


plotcp(m.cart)

# use r square
rsq.rpart(m.cart)
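
# If the cp plot suggests trimming the tree, a minimal pruning sketch (the same
# prune() idiom used for the classifier further below) would be:
m.pruned<-prune(m.cart, cp=m.cart$cptable[which.min(m.cart$cptable[,"xerror"]),"CP"])
plot(m.pruned,uniform=T,margin=.2)
text(m.pruned,cex=.8)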

# Use the model to predict the rating for validation data


m.val<-predict(m.cart, newdata=dv)

# Get the difference between the predicted and observed values of rating
diff<-dv$rating-m.val

# Plot the result


plot(dv$rating,diff,pch=20, col="red")
abline(h=0, col="blue")
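
# Optional check (not in the original steps): summarize the validation error
# numerically using the differences computed above
mae<-mean(abs(diff))
rmse<-sqrt(mean(diff^2))
print(c(MAE=mae,RMSE=rmse))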

# Prepare to get the scored data into a csv file


output<-data.frame(list(m.val,dv$rating))
colnames(output)<-c("Predicted","Observed")
write.csv(output,"score.csv",row.names=F)

# Use RandomForest to compare results


library(randomForest)
m.rf<-randomForest(rating~.,dt)
varImpPlot(m.rf)
# Predict using the RandomForest Model
p.rf<-predict(m.rf,newdata=dv)

# Write the score of both the models along with observed data for comparison
output<-data.frame(list(m.val,p.rf,dv$rating))
colnames(output)<-c("PredictedCART","PredictedRF","Observed")
write.csv(output,"score.csv",row.names=F)
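
# Optional comparison sketch: validation RMSE of the two models (lower is better)
rmse.cart<-sqrt(mean((dv$rating-m.val)^2))
rmse.rf<-sqrt(mean((dv$rating-p.rf)^2))
print(c(CART=rmse.cart,RandomForest=rmse.rf))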

# This script is for a classifier using CART


library(rpart)

# Read the play.csv dataset


d<-read.csv("Play.csv ")

# Use the rpart function to generate the CART


m<-rpart(Play~., d, method="class", parms=list(split="information"),
         control=rpart.control(minsplit=2, minbucket=1))

# The following rpart includes a prior proportion of the two classes


#rpart(Play~.,d,method="class",parms=list(prior=c(.8,.2),split="information"),control=rpart.control(minsplit=2,minbucket=1))

# See the rules generated by the CART


print(summary(m))

# See the complexity at each partition


printcp(m)

# Pruning the tree based on the minimum of the cross-validation error 'xerror'
# This is commented out here, as pruning would leave only the root node in this example
#pm<-prune(m,cp=m$cptable[which.min(m$cptable[,"xerror"]),"CP"])

# Plot the tree to see the same visually


plot(m,uniform=T,branch=1,margin=.2)
text(m,cex=.6, pretty=0)

# Following will generate a more decorative tree in ps format


post(m,file="tree.ps")

# Predicting on the training dataset, as no validation dataset was defined.


# Predict will compute the classes of the given records
p<-predict(m, newdata=d, type="class")

# Get the confusion matrix


print(table(d$Play,p))

# This script performs Logistic Regression on the Universal Bank Dataset


# The dependent variable PersonalLoan is predicted from a single predictor, Income

# Use Library Caret


library(caret)

# Read the dataset into a data frame


d<-read.csv("UniversalBank.csv")

# Convert PersonalLoan Column to factor


d$PersonalLoan=as.factor(as.character(d$PersonalLoan))

# Divide into training and validation partition


set.seed(53)
r<-createDataPartition(y=d$PersonalLoan,p=0.7,list=F)
dt<-d[r,]
dv<-d[-r,]

# Run Logistic Regression on the training dataset using Income as predictor
# and PersonalLoan as outcome variable
m.lr<-train(PersonalLoan~Income, data=dt, method="glm",family="binomial")
summary(m.lr)$coef
summary(m.lr)
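
# Optional interpretation sketch: m.lr$finalModel is the glm object stored by
# caret's train(); exponentiating its coefficients gives odds ratios, i.e. the
# multiplicative change in the odds of PersonalLoan per unit of Income
exp(coef(m.lr$finalModel))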

# Predict the result on validation data


fit.m=predict(m.lr,dv)

confusionMatrix(fit.m, dv$PersonalLoan, positive = "1")

# Draw ROC Curve


library("ROCR")
fit.m=predict(m.lr,dv, type="prob")
pred<-prediction(fit.m[2],dv$PersonalLoan)
perf<-performance(pred,"tpr","fpr")
plot(perf,main="ROC Curve",colorize=T)
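
# Optional: area under the ROC curve via ROCR's "auc" measure
auc<-performance(pred,"auc")
print(auc@y.values[[1]])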

# As the ROC Curve shows that high TPR can be achieved at low
# cutoff probabilities, draw confusion matrix accordingly
predcton<-ifelse(fit.m[,2]>0.2,1,0)
print("At cutoff=0.2")
print(table(dv$PersonalLoan,predcton))

# Try a slightly higher cutoff and compare


predcton<-ifelse(fit.m[,2]>0.22,1,0)
print("At cutoff=0.22")
print(table(dv$PersonalLoan,predcton))

# Draw lift charts


library(lift)
plotLift(fit.m[,2],dv$PersonalLoan)
plotLift(fit.m[,2],dv$PersonalLoan,cumulative = F)

plotLift(predcton,dv$PersonalLoan)
plotLift(predcton,dv$PersonalLoan,cumulative = F)

# Copy the data to csv file for further analysis


dout=data.frame(Outcome=predcton)
dv$Outcome=dout$Outcome
write.csv(dv,"PredUniversalBank.csv",row.names = F)

# This script performs Logistic Regression on the Universal Bank Dataset


# The dependent variable PersonalLoan is predicted from a single predictor, Income
# Read the dataset into a data frame
d<-read.csv("UniversalBank.csv")

# Divide into training and validation partition


set.seed(53)
r<-sample(seq_len(nrow(d)),0.7*nrow(d),replace=F)
dt<-d[r,]
dv<-d[-r,]

# Run Logistic Regression on the training dataset using Income as predictor
# and PersonalLoan as outcome variable
nrow(dt)
m.lr<-glm(PersonalLoan~Income,data=dt,family="binomial")

# As we are using only one predictor, we can plot and see the relation
# between the observed and predicted points

plot(PersonalLoan~Income,data=dt)

lines(dt$Income,m.lr$fitted.values,type="p",col="blue")

# See the details of the model generated


print(summary(m.lr))

# See the anova table with Chi Squared test


anova(m.lr,test="Chisq")
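
# Optional fit summary (a rough sketch): McFadden's pseudo R-squared from the
# residual and null deviances of the glm
print(1-m.lr$deviance/m.lr$null.deviance)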

# Use the model on validation data to see the performance


# Type 'response' ensures that the output is on the same scale as the
# response variable, and not on the logit scale
m.vl<-predict(m.lr,newdata=dv, type="response")

# Draw ROC Curve


library("ROCR")
pred<-prediction(m.vl,dv$PersonalLoan)
perf<-performance(pred,"tpr","fpr")
plot(perf,main="ROC Curve",colorize=T)
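
# Optional sketch: instead of reading the cutoff off the curve, pick the cutoff
# that maximizes TPR - FPR (Youden's J) from the ROCR performance object
tpr<-perf@y.values[[1]]
fpr<-perf@x.values[[1]]
cutoffs<-perf@alpha.values[[1]]
print(cutoffs[which.max(tpr-fpr)])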

# As the ROC Curve shows that high TPR can be achieved at low
# cutoff probabilities, draw confusion matrix accordingly
predcton<-ifelse(m.vl>0.15,1,0)
print("At cutoff=0.15")
print(table(dv$PersonalLoan,predcton))

# Try with even lower cutoff at the cost of FPR


predcton<-ifelse(m.vl>0.1,1,0)
print("At cutoff=0.1")
print(table(dv$PersonalLoan,predcton))

# Draw lift charts


library(lift)
plotLift(m.vl,dv$PersonalLoan)
plotLift(m.vl,dv$PersonalLoan,cumulative = F)

plotLift(predcton,dv$PersonalLoan)
plotLift(predcton,dv$PersonalLoan,cumulative = F)

# Copy the data to csv file for further analysis


dout=data.frame(Outcome=predcton)
dv$Outcome=dout$Outcome
write.csv(dv,"PredUniversalBank.csv",row.names = F)
# Using caret package for KNN
library(caret)

# Read dataset
wine=read.csv(choose.files())
names(wine)

# Use for estimation

# Training and Validation Partition


set.seed(234)
rec=createDataPartition(y=wine$Origin,p=0.7,list=F)
winet=wine[rec,]
winev=wine[-rec,]

# Preprocess. Scaling is necessary if the ranges of the variables differ
summary(winet)
pps=preProcess(winet[-1],method = c("scale","center"))
winetpp=predict(pps,winet)

# Check the output


names(winetpp)
head(winetpp)

# Develop the model and check


set.seed(234)
ctrl=trainControl(method="repeatedcv",number = 10,repeats = 3)
mod=train(Origin~.,data=winetpp,trControl=ctrl,method="knn",tuneLength=10)
mod
plot(mod)

# validate the model


winevpp=predict(pps,winev)
pred=predict(mod,winevpp)
pred
mape=sum(abs((pred-winevpp$Origin)/winevpp$Origin))/nrow(winevpp)*100
mape

# Use for Classification

# Convert the output as factor


wine$Origin=factor(wine$Origin)

# Training and Validation Partition


set.seed(234)
rec=createDataPartition(y=wine$Origin,p=0.7,list=F)
winet=wine[rec,]
winev=wine[-rec,]

# Preprocess. Scaling is necessary if the ranges of the variables differ
summary(winet)
pps=preProcess(winet[-1],method = c("scale","center"))
winetpp=predict(pps,winet)

# Check the output


names(winetpp)
head(winetpp)

# Develop the model and check


set.seed(234)
ctrl=trainControl(method="repeatedcv",number = 10,repeats = 3)
mod=train(Origin~.,data=winetpp,trControl=ctrl,method="knn",tuneLength=20)
mod
plot(mod)

# validate the model


winevpp=predict(pps,winev)
pred=predict(mod,winevpp)
pred
table(winevpp$Origin,pred)

# Using k-Nearest Neighbor in the Iris dataset to classify in terms of Species
# Load dataset & library

library(class)
di<-read.table("iris.csv",header=T,sep=",")

# Training and Validation Partitions


set.seed(1020)
part<-sample(1:nrow(di),ceiling(2/3*nrow(di)),replace=F)
trn<-di[part,]
val<-di[-part,]

# Run kNN algorithm


pred<-knn(trn[,-5],val[,-5],trn[,5],k=3)

# Generate Confusion Matrix


print("k=3")
print(table(pred,val[,5]))

# Finetune model with different values of k


pred<-knn(trn[,-5],val[,-5],trn[,5],k=1)
print("k=1")
print(table(pred,val[,5]))

pred<-knn(trn[,-5],val[,-5],trn[,5],k=5)
print("k=5")
print(table(pred,val[,5]))
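
# Optional tuning sketch: loop over several values of k and report validation
# accuracy for each, rather than checking a few k's by hand
for(k in c(1,3,5,7,9)){
  p<-knn(trn[,-5],val[,-5],trn[,5],k=k)
  print(paste("k =",k,"accuracy =",round(sum(p==val[,5])/nrow(val),3)))
}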


# Use Iris dataset with Naive Bayes - Caret Package

# Open Dataset
di<-read.csv("iris.csv",header=T)
head(di)

# Call Library
library(caret)

# Partition dataset
rec=createDataPartition(y=di$Species,p=0.7,list=F)
dit=di[rec,]
div=di[-rec,]

# Train the model and explore


mnb=train(Species~.,data=dit,method="nb")
summary(mnb)
names(mnb)
mnb$results

# Use the model to predict on validation dataset and evaluate


outp=predict(mnb,div)
table(div$Species,outp)

# Output the probability values instead of class


outp=predict(mnb,div,type="prob")

# Using Naive Bayes in the Iris dataset to classify in terms of Species


# Load dataset & library
library(klaR)

di<-read.table("iris.csv",header=T,sep=",")

# Training and Validation Partitions


set.seed(1020)
part<-sample(1:nrow(di),ceiling(2/3*nrow(di)),replace=F)
trn<-di[part,]
val<-di[-part,]

# Develop model over training set


m.tr<-NaiveBayes(Species~.,data=trn)

# Plot the model


plot(m.tr)

# Check the performance of the model on the training set first


m.vl<-predict(m.tr,newdata=trn)

# Print the confusion matrix for training set


print(table(trn$Species,m.vl$class))

# Print the confusion matrix for validation set


m.vl<-predict(m.tr,val)
print(table(val$Species,m.vl$class))

# Load a new dataset which does not have the classification


pd<-read.table("irispred.csv",header=T,sep=",")
# PCA using the provided functions
# Load the wine.csv data frame
wine<-read.csv("wine.csv",header=T)

# Plot to see correlations


# Not including 'Price' in the scatterplot matrix, as we are checking
# correlations amongst the predictors
# 'Price' is the dependent variable
library(car)
scatterplotMatrix(wine[-4])

# Find the principal Components


wine.pc=prcomp(wine[-4],center=T,scale=T)

# See what are the results of prcomp


print(paste("What does the prcomp result in?"))
print(names(wine.pc))

# See the summary of the PCA


print(paste("Summary of PCA"))
print(summary(wine.pc))

# Get the eigenvalues as the square of the standard deviations - i.e., the variances


print(paste("Eigenvalues :"))
print(wine.pc$sdev^2)
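
# Optional: proportion of variance explained by each component, computed directly
# from the eigenvalues (these match the figures in summary(wine.pc))
prop.var<-wine.pc$sdev^2/sum(wine.pc$sdev^2)
print(round(prop.var,3))
print(round(cumsum(prop.var),3))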

# Draw a screeplot to decide how many components to take


screeplot(wine.pc,main="Scree Plot",xlab="Components")
screeplot(wine.pc,type="line", main="Scree Plot")

# Dotplot PC1
library(lattice)

load = wine.pc$rotation
print(paste("The loadings are as follows: "))
print(load)

# order the weights of PC1 per variable


ordered.load=load[order(load[,1]),1]
dotplot(ordered.load,main="Loadings Plot of PC1",xlab="Variable Loadings",col="red",cex=1.5)

# Dotplot PC2
ordered.load2=load[order(load[,2]),2]
dotplot(ordered.load2,main="Loadings Plot of PC2",xlab="Variable Loadings",col="red",cex=1.5)

# Draw a biplot
biplot(wine.pc,cex=c(1,0.7))

# Print the final scored data


print(wine.pc$x)
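
# Optional sketch (an extension, not part of the original script): regress the
# dependent variable 'Price' on the first two principal component scores
wine.pcr<-lm(wine$Price ~ wine.pc$x[,1] + wine.pc$x[,2])
print(summary(wine.pcr))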

# Cereal dataset - to predict rating

# Use library caret


library(caret)

# load dataframe
cer=read.csv(choose.files(),header=T)

# remove the first column - Cereal - as that is just the name and may not be important
cer=cer[,-1]

# partition into training and validation partitions


rec=createDataPartition(cer$rating,p=0.7,list=F)
certrain=cer[rec,]
cerval=cer[-rec,]

# create the model and see the outcomes


mlm=train(rating~.,data=certrain,method="lm")
summary(mlm)

# list the importance of variables - in this case by the absolute value of the t-statistics
varImp(mlm)
plot(varImp(mlm))

# predict on the validation set


predrating=predict(mlm,cerval)
mape=sum(abs((cerval$rating-predrating)/cerval$rating))/nrow(cerval)*100
mape

# This is a script to draw the plot of Cook's distance for a regression model
# and identify those points which have a higher value of Cook's distance

# Read in the csv file

d<-read.csv("regtest.csv")

# Partition into training and validation data set


sam<-sample(seq_len(nrow(d)),0.7*nrow(d),replace=F)

dt<-d[sam,]

dv<-d[-sam,]
# Develop the regression model based on the training set
m.lm<-lm(endurance~age, data=dt)

# Compute Cook's distance


cd<-cooks.distance(m.lm)

# Identify the data points which have a Cook's distance greater than 0.04
tp<-seq(1:length(cd))
ip<-tp[cd>0.04]

iv<-cd[ip]

# Make the final plot with necessary identification


plot(cd)

text(ip,iv-(max(cd)*0.05),names(iv),col="blue",cex=0.7)
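
# Optional variant: rather than the fixed 0.04 threshold, a common rule of thumb
# flags points with Cook's distance above 4/n
thr<-4/nrow(dt)
print(tp[cd>thr])
abline(h=thr,col="red",lty=2)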

# Draw scatterplot matrix using ggplot


library(ggplot2)

# GGally is a package that is based on ggplot2 and enhances certain features
library(GGally)

# Open iris dataset


di=read.csv(choose.files())

# Draw scatterplot
ggscatmat(di,columns=1:4)

# Draw scatterplot matrix for the numerical values


ggpairs(di, columns=1:4)

# Brush based on Species


ggpairs(di, columns=1:4,aes(color=di$Species))

# Draw scatterplot
ggscatmat(di,columns=1:4)

# Simple preprocessing of data using library caret

# load library
library(caret)

# Open the dataset and investigate


di<-read.csv(choose.files(),header=T)
head(di)
str(di)

# Partition the data


set.seed(34)
rec=createDataPartition(y=di$Species,p=0.7,list=F)
dit=di[rec,]
div=di[-rec,]
# Scale the data
# preprocessing options in caret:
# "center": subtract mean from values.
# "scale": divide values by standard deviation.
# "range": normalize values.
# In case of missing values, one can use knn to impute "knnImpute"
pps=preProcess(dit,method=c("scale","center"))
train=predict(pps,dit)
val=predict(pps,div)

# One can also create dummy variables using caret


# Try to create dummy variables for "Species"
dmy=dummyVars("~.",di,fullRank = T)

# fullRank TRUE will create n-1 dummies, while FALSE will create n dummies
dnnew=data.frame(predict(dmy,newdata=di))

# the outcome is a matrix, hence it needs to be coerced to a data frame

# Example for Feature Selection

# Read file - Cement.txt


d<-read.table(choose.files(),header=T)
names(d)

# Using library caret


library(caret)

# Use Recursive Feature Elimination (Backward Selection)


set.seed(10)
ctrl=rfeControl(functions=lmFuncs,method="cv",verbose=F)
lmfinal=rfe(d[,-11],d[,11],sizes=c(5:10),rfeControl = ctrl)
lmfinal

# Plot the result to understand the selection


plot(lmfinal)
plot(lmfinal, metric = "MAE")
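
# Optional: list the predictors retained by the backward selection
print(predictors(lmfinal))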

# Using library leaps


library(leaps)

# method can be "Cp","adjr2", or "r2"


# (the response is column 11, consistent with the rfe call above)
v=leaps(d[,-11],d[,11],method = "Cp")

# Use regsubsets (method may be exhaustive, forward, backward,seqrep)


v1=regsubsets(y~.,d,nbest=2,nvmax=12,method="exhaustive")
n=summary(v1)
n

# Check the result


n$which
n$adjr2
n$cp
n$r2
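
# Optional: pick the subset with the highest adjusted R-squared and show its
# coefficients (coef() on a regsubsets object takes the model id)
best<-which.max(n$adjr2)
print(coef(v1,id=best))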

# Step Wise Selection using caret


cpm=train(y~.,d,method="glmStepAIC")
# use the previous Naive Bayes model m.tr (from the klaR script above) to predict
# the class of the new records in pd
print(predict(m.tr,pd))

# cSplit(df, 1:ncol(df), sep=",", stripWhite=TRUE, type.convert=FALSE)

# Code to remove rows based on row names

# Load Cereals Dataset


d=read.csv(choose.files())

# Create data partitions


library(caret)
set.seed(55)
rec=createDataPartition(y=d$rating,p=0.7,list=F)
dt=d[rec,]
dv=d[-rec,]

# We want to remove records with rownames of 5 and 7


head(dt)
rowtoremove=c(5,7)
dtn=dt[!row.names(dt) %in% rowtoremove,]

# Check if those two records have been removed


head(dtn)

