Documente Academic
Documente Profesional
Documente Cultură
Ans: RESPONSE
3) Seed and split used by our group according to the convention informed earlier
Ans: Seed = (18086 + 18090 + 18095) / 3 ≈ 18090
Split = 70:30
Significant Variables: All the below variables are the significant variables
1|Page
Figure 1: ROC Plot for Logistic Regression after removal of insignificant
variables
2|Page
3|Page
Figure 3: Classification Tree
4|Page
Figure 4: Neural network diagram (the numeric weight labels were garbled during
extraction and are unreadable). Input nodes: DURATION, AMOUNT, INSTALL_RATE,
HISTORY1–HISTORY4, NEW_CAR1, USED_CAR1, FURNITURE1, RADIO.TV1, EDUCATION1,
RETRAINING1, SAV_ACCT1–SAV_ACCT4, EMPLOYMENT1–EMPLOYMENT4, MALE_DIV1,
MALE_SINGLE1, MALE_MAR_or_WID1, CO.APPLICANT1, GUARANTOR1,
PRESENT_RESIDENT2–PRESENT_RESIDENT4, REAL_ESTATE1, PROP_UNKN_NONE1, AGE,
OTHER_INSTALL1, RENT1, OWN_RES1, NUM_CREDITS, JOB1–JOB3, NUM_DEPENDENTS,
TELEPHONE1, FOREIGN1; output node: RESPONSE1. The scattered numbers in the
original were the estimated connection weights along the edges.
5|Page
5) AUC obtained from the two classifiers
Ans:
AUC
6) Which classifier gives the best model? Note down the significant variables from this
model. Your model must fulfil the assumptions required for developing that model.
Ans: Logistic Regression gives the best model in terms of accuracy and the AUC/ROC curve.
Significant Variables:
6|Page
7) If you wish to find all potential defaulters, how many records, at minimum, would
you need to sift through based on your model?
Ans: 90
8) A customer approaches the bank for credit. His details are as follows:
Checking Account > 200 DM;
History: Delay in Paying Off;
Savings Account: Greater than 1000 DM;
Purpose of Credit: New Car;
Amount: 1000;
Employment: 4-7 Years;
Instalment Rate: 3;
Marital Status: Male Married;
Co-Applicant: Applicant has a guarantor;
Present Residence: 2: 2-3 years;
Real Estate: Applicant owns no property;
Age: 35;
Other Instalments: No;
Residence: No;
Number of Credits: 2;
Job: Skilled Employee;
Number of Dependents: 2;
Telephone: Owns a phone;
Foreign: No.
Should the bank give him a loan or not?
Ans: Since the duration is not given in the question, we remove DURATION from
the model and rerun it. As a result, the significant variables identified in Q5
and Q6 may change.
7|Page
Logit(y) = 0.8926989 + 0.4221860*CHK_ACCT1 + 0.9152301*CHK_ACCT2 + 1.8448606*CHK_ACCT3
  + 0.2533980*HISTORY1 + 0.9779288*HISTORY2 + 1.2332237*HISTORY3 + 1.7567161*HISTORY4
  - 0.9116212*NEW_CAR1 + 1.2407763*USED_CAR1 - 1.1092321*EDUCATION1 - 0.0002041*AMOUNT
  + 0.2703463*SAV_ACCT1 + 0.7245670*SAV_ACCT2 + 1.3971794*SAV_ACCT3 + 0.6038728*SAV_ACCT4
  - 0.2105577*EMPLOYMENT1 - 0.0862687*EMPLOYMENT2 + 0.5013060*EMPLOYMENT3 + 0.1309950*EMPLOYMENT4
  - 0.3648801*INSTALL_RATE + 0.4831641*MALE_SINGLE1 + 1.0396041*GUARANTOR1
  - 0.6856990*PRESENT_RESIDENT2 - 0.4517887*PRESENT_RESIDENT3 - 0.3590037*PRESENT_RESIDENT4
  - 0.5621355*OTHER_INSTALL1 + 1.5084766*FOREIGN1
= 4.009
8|Page
# CODE:
# Unbalanced dataset

# Packages used by this analysis. The original script called
# install.packages() unconditionally for each one and listed "caret" twice;
# install only what is missing instead.
pkgs <- c(
  "caTools", "ROCR", "gains", "rpart", "rpart.plot", "randomForest",
  "caret", "ggplot2", "lattice", "e1071", "neuralnet", "grid",
  "MASS", "nnet"
)
missing_pkgs <- setdiff(pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

getwd()

# Read the data set before the column conversions that follow; the original
# script used `data` here without ever creating it in this section (the
# read.csv("10-GermanCredit.csv") call only appeared later, in the
# neural-network section).
data <- read.csv("10-GermanCredit.csv")
# 1. Type conversions --------------------------------------------------------
# The categorical predictors are stored as integer codes in the CSV; convert
# them to factors so that glm()/rpart() treat them as dummy variables, and
# keep the genuinely numeric measurements as numeric. This replaces 29
# repetitive per-column assignments (which also had a stray "9|Page"
# page-number artifact breaking the script).
factor_cols <- c(
  "CHK_ACCT", "HISTORY", "SAV_ACCT", "NEW_CAR", "USED_CAR", "FURNITURE",
  "RADIO.TV", "EDUCATION", "RETRAINING", "EMPLOYMENT", "MALE_DIV",
  "MALE_SINGLE", "MALE_MAR_or_WID", "CO.APPLICANT", "PRESENT_RESIDENT",
  "REAL_ESTATE", "OTHER_INSTALL", "JOB", "TELEPHONE", "OWN_RES", "RENT",
  "FOREIGN", "RESPONSE"
)
numeric_cols <- c(
  "DURATION", "AMOUNT", "INSTALL_RATE", "AGE", "NUM_CREDITS",
  "NUM_DEPENDENTS"
)
data[factor_cols] <- lapply(data[factor_cols], as.factor)
data[numeric_cols] <- lapply(data[numeric_cols], as.numeric)
str(data)
# 2. Exploratory analysis ----------------------------------------------------
# Check for potentially multicollinear variables (high pairwise correlation
# among the numeric predictors only).
cor(data[sapply(data, function(x) !is.factor(x))])
# Amount and Duration have ~0.6 correlation, but we did not remove them.

# 3. Train/validation split: seed 18090 and 70:30 ratio per the group
#    convention. sample.split() stratifies on RESPONSE so both partitions
#    keep the same class proportions.
library(caTools)
set.seed(18090)
split <- sample.split(data$RESPONSE, SplitRatio = 0.7)
train <- subset(data, split == TRUE)
valid <- subset(data, split == FALSE)
str(train)
str(valid)
table(train$RESPONSE)
help(table)
# Majority-class baseline on the training partition.
baseline.train <- 490 / (490 + 210)  # 70.00%
# 4. Run logistic regression on the training data and make predictions on
#    the validation data set.
# glm() (generalized linear model) with family = "binomial" fits a logistic
# regression of RESPONSE on all other columns.
train.log <- glm(RESPONSE ~ ., data = train, family = "binomial")
options(scipen = 999)  # avoid scientific notation in the summary output
summary(train.log)
train.log$fitted.values
help(predict)
# NOTE(review): predict() takes `newdata`, not `data`; with `data = train`
# the argument was silently ignored and the in-sample fitted values were
# returned (same result here, but wrong usage).
predictTrain <- predict(train.log, type = "response", newdata = train)
predictTrain
# Confusion matrix at the 0.5 cutoff and training accuracy.
table(train$RESPONSE, predictTrain > 0.5)
(445 + 125) / (45 + 125 + 445 + 85)  # 81.42%
# 5. ROC curve and AUC for the logistic model on the validation set.
# NOTE(review): the original script used predictValid before ever defining
# it; compute the validation-set predicted probabilities here.
predictValid <- predict(train.log, newdata = valid, type = "response")
# Confusion matrix at the 0.5 cutoff (presumably lost in extraction; the
# accuracy figure below requires it) and validation accuracy.
table(valid$RESPONSE, predictValid > 0.5)
(47 + 183) / (47 + 183 + 27 + 43)  # 76.67%
library(ROCR)
ROCRpred <- prediction(predictValid, valid$RESPONSE)
str(ROCRpred)
ROCRperf <- performance(ROCRpred, "tpr", "fpr")
help(performance)
plot(ROCRperf, col = "black", lty = 2, lwd = 1)
plot(ROCRperf, col = rainbow(7))
# Reference diagonal for the ROC plot.
lines(c(0, sum(as.numeric(valid$RESPONSE))) ~ c(0, dim(valid)[1]), lty = 2)
auc <- performance(ROCRpred, "auc")@y.values  # 81.23%
auc
# 6. Youden's J statistic (sensitivity + specificity - 1) at every ROC
#    cutoff, then the cumulative lift chart.
sens <- ROCRpred@tp[[1]] / (ROCRpred@tp[[1]] + ROCRpred@fn[[1]])
spec <- ROCRpred@tn[[1]] / (ROCRpred@tn[[1]] + ROCRpred@fp[[1]])
Youden <- sens + spec - 1
max(Youden)  # 0.461
# Inspect sensitivity/specificity/J side by side. The original bound these
# to a, b, and c — shadowing base::c() — before cbind()-ing them.
z <- as.data.frame(cbind(a = sens, b = spec, c = Youden))
z
max(z$c)
plot(Youden)
str(ROCRperf)
ROCRperf@alpha.values
library(gains)
# One gains group per observation so the lift curve is smooth.
gain <- gains(as.integer(valid$RESPONSE), predictValid,
              groups = length(predictValid))
# Plot lift chart (the gain$cume.obs identifier was broken across two lines
# in the extracted original).
plot(c(0, gain$cume.pct.of.total * sum(as.numeric(valid$RESPONSE))) ~
       c(0, gain$cume.obs),
     xlab = "# cases", ylab = "Cumulative", main = "", type = "l")
# Baseline (random-selection) reference line.
lines(c(0, sum(as.numeric(valid$RESPONSE))) ~ c(0, dim(valid)[1]), lty = 2)
# Classification modeling ----------------------------------------------------
# 7. Run a classification tree on the training data and make predictions on
#    the validation data set.
library(rpart)
library(rpart.plot)
library(ROCR)
# NOTE(review): the tree fit itself was missing from the extracted script,
# although predict() below requires it; refit the class-probability tree
# the section expects — confirm settings against the original source.
train.y.ct <- rpart(RESPONSE ~ ., data = train, method = "class")
predictROC <- predict(train.y.ct, newdata = valid)
predictROC
# Column 2 holds the predicted probability of RESPONSE == 1.
pred <- prediction(predictROC[, 2], valid$RESPONSE)
perf <- performance(pred, "tpr", "fpr")
perf
plot(perf, col = rainbow(10), lty = 2)
auc <- performance(pred, "auc")@y.values  # 74.42
auc

# NOTE(review): the random-forest fit producing predictValidForest is also
# missing from the extracted script — presumably predict() from a
# randomForest model on valid; confirm against the original source.
table(valid$RESPONSE, predictValidForest)
(38 + 191) / (38 + 191 + 19 + 52)  # 76.33% validation accuracy
# 10. Neural network ----------------------------------------------------------
# Re-read the data set and redo the type conversions for the neural-network
# section (same conversions as the first section of the script; the original
# repeated all 29 per-column assignments and contained a "14 | P a g e"
# page-number artifact that broke the script).
data <- read.csv("10-GermanCredit.csv")
str(data)
summary(data)
nn_factor_cols <- c(
  "CHK_ACCT", "HISTORY", "SAV_ACCT", "NEW_CAR", "USED_CAR", "FURNITURE",
  "RADIO.TV", "EDUCATION", "RETRAINING", "EMPLOYMENT", "MALE_DIV",
  "MALE_SINGLE", "MALE_MAR_or_WID", "CO.APPLICANT", "PRESENT_RESIDENT",
  "REAL_ESTATE", "OTHER_INSTALL", "JOB", "TELEPHONE", "OWN_RES", "RENT",
  "FOREIGN", "RESPONSE"
)
nn_numeric_cols <- c(
  "DURATION", "AMOUNT", "INSTALL_RATE", "AGE", "NUM_CREDITS",
  "NUM_DEPENDENTS"
)
data[nn_factor_cols] <- lapply(data[nn_factor_cols], as.factor)
data[nn_numeric_cols] <- lapply(data[nn_numeric_cols], as.numeric)
str(data)
# Build the model formula "RESPONSE1 ~ <all other columns>" programmatically
# (neuralnet() does not expand the "." shorthand).
# NOTE(review): the target here is RESPONSE1, which suggests train was
# one-hot encoded (e.g. via model.matrix) before this point; the train built
# earlier has RESPONSE, not RESPONSE1 — confirm against the original source.
n <- names(train)
f <- as.formula(paste("RESPONSE1 ~",
                      paste(n[!n %in% "RESPONSE1"], collapse = " + ")))
# One hidden node; linear.output = FALSE for classification, TRUE for
# regression.
train.nn <- neuralnet(f, data = train, hidden = 1, threshold = 0.01,
                      linear.output = FALSE)
summary(train.nn)
# result.matrix summarizes each replication: the error, reached threshold,
# steps needed, AIC/BIC, and the estimated weights.
train.nn$result.matrix
plot(train.nn)  # network diagram with the estimated weights
help(neuralnet)
# Training-set predictions and the confusion matrix at the 0.5 cutoff.
predict.train.nn <- compute(train.nn, train)
table(train$RESPONSE, predict.train.nn$net.result > 0.50)
(465 + 126) / (465 + 126 + 25 + 84)  # 84.42% training accuracy