
Prediction of Diamond Price: 2 Step Method

Sarajit Poddar
31 July 2015

Contents

1 Objective

2 Algorithm development & Testing
  2.1 Initial setup
  2.2 Exploring the data
  2.3 Model development
  2.4 Developing Predictive Models (Decision Tree)
  2.5 Developing Predictive Models (Randomforest)
  2.6 Predicting the price in the Test dataset
  2.7 Analysing the Residuals

1 Objective

The objective of this article is to explore machine learning algorithms for classifying diamonds into price buckets based on their various characteristics. The method has two steps: a classifier first assigns each diamond to a price bucket, and a linear model then predicts the exact price using that bucket as an additional predictor.

2 Algorithm development & Testing

2.1 Initial setup

2.1.1 Load libraries

Data cleansing, tidying and transformation libraries, plus plotting libraries.

# Load required libraries
library(dplyr); library(tidyr); library(ggplot2)

Specialised libraries for machine learning.

# Load required libraries
library(caret); library(randomForest)
library(rattle); library(rpart.plot)

# Set seed so that the results are reproducible
set.seed(1000)

2.1.2 Subsetting the dataset

Subsetting step 1: Price Range

# Load the diamonds dataset
data(diamonds)

# Price range
price.low <- 1000
price.high <- 5000

# Subsetting the data based on the price range
data.sample <- subset(diamonds, price >= price.low & price <= price.high)

Subsetting step 2: Number of observations

# Number of observations
input.obs <- 5000

# Sampling the data from the subset based on the number of observations
data.sample <- data.sample[sample(1:nrow(data.sample), input.obs, replace=FALSE),]

# Assigning the value of dataset variable
dataset <- data.sample

2.1.3 Cutting the diamond price and carats into ordinal factors

# Diamond price: cut by intervals of 500
# fprice = factored price
dataset$fprice <- as.numeric(cut(dataset$price,
                                 seq(from = 0, to = 10000, by = 500)))
# Convert factored price into ordinal factor
dataset$fprice <- ordered(dataset$fprice)

# Diamond carat: cut by intervals of 0.1
# fcarat = factored carat
dataset$fcarat <- as.numeric(cut(dataset$carat,
                                 seq(from = 0, to = 10, by = 0.1)))
# Convert factored carat into ordinal factor
dataset$fcarat <- ordered(dataset$fcarat)

# Remove records with NA
dataset <- na.omit(dataset)
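As a quick illustration of the bucketing (a sketch, not part of the original analysis): with breaks at seq(0, 10000, by = 500), a price of 2391 falls in the interval (2000, 2500], the fifth interval, so its fprice is 5.

# Hypothetical check of the price bucketing
as.numeric(cut(2391, seq(from = 0, to = 10000, by = 500)))
# [1] 5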

2.1.4 Structure of dataset

str(dataset)

## 'data.frame': 5000 obs. of 12 variables:
##  $ carat  : num 1.02 0.57 0.71 0.51 0.41 0.79 1 0.52 0.9 1.19 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 3 5 3 5 5 5 1 5 3 4 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 5 1 2 1 2 3 6 6 3 1 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 3 3 6 4 5 3 1 6 3 2 ...
##  $ depth  : num 58.4 61 63.4 62.2 62.1 62 68.6 62.2 61.7 58.6 ...
##  $ table  : num 62 57 58 58 56 55 56 57 60 61 ...
##  $ price  : int 4518 1847 3340 1678 1153 3081 1805 1316 3950 4168 ...
##  $ x      : num 6.55 5.34 5.64 5.1 4.78 5.93 6.2 5.18 6.16 6.93 ...
##  $ y      : num 6.61 5.31 5.69 5.12 4.75 5.95 6.08 5.11 6.22 6.89 ...
##  $ z      : num 3.84 3.25 3.59 3.18 2.96 3.68 4.22 3.2 3.82 4.05 ...
##  $ fprice : Ord.factor w/ 9 levels "2"<"3"<"4"<"5"<..: 9 3 6 3 2 6 3 2 7 8 ...
##  $ fcarat : Ord.factor w/ 14 levels "3"<"4"<"5"<"6"<..: 9 4 6 4 3 6 8 4 7 10 ...

summary(dataset)

##      carat               cut           color       clarity         depth
##  Min.   :0.2500   Fair     : 213   D: 704   SI1    :1319   Min.   :53.10
##  1st Qu.:0.5200   Good     : 518   E:1004   SI2    :1028   1st Qu.:61.00
##  Median :0.7000   Very Good:1081   F: 994   VS2    : 992   Median :61.80
##  Mean   :0.7127   Premium  :1209   G: 953   VS1    : 629   Mean   :61.78
##  3rd Qu.:0.9000   Ideal    :1979   H: 685   VVS2   : 417   3rd Qu.:62.60
##  Max.   :1.5100                    I: 418   VVS1   : 349   Max.   :79.00
##                                    J: 242   (Other): 266
##      table           price             x               y               z
##  Min.   :51.00   Min.   :1000   Min.   :0.000   Min.   :4.230   Min.   :0.000
##  1st Qu.:56.00   1st Qu.:1624   1st Qu.:5.180   1st Qu.:5.190   1st Qu.:3.200
##  Median :57.00   Median :2391   Median :5.690   Median :5.700   Median :3.520
##  Mean   :57.57   Mean   :2635   Mean   :5.661   Mean   :5.662   Mean   :3.496
##  3rd Qu.:59.00   3rd Qu.:3684   3rd Qu.:6.150   3rd Qu.:6.150   3rd Qu.:3.830
##  Max.   :76.00   Max.   :5000   Max.   :7.380   Max.   :7.200   Max.   :4.820
##      fprice         fcarat
##  3      :1105   6      :1127
##  4      : 855   8      : 846
##  5      : 656   5      : 578
##  6      : 547   11     : 548
##  9      : 512   7      : 517
##  10     : 463   9      : 418
##  (Other): 862   (Other): 966

head(dataset, 5)

##       carat       cut color clarity depth table price    x    y    z
## 9068   1.02 Very Good     H     SI1  58.4    62  4518 6.55 6.61 3.84
## 47377  0.57     Ideal     D     SI1  61.0    57  1847 5.34 5.31 3.25
## 3178   0.71 Very Good     E    VVS2  63.4    58  3340 5.64 5.69 3.59
## 45482  0.51     Ideal     D     VS2  62.2    58  1678 5.10 5.12 3.18
## 40631  0.41     Ideal     E     VS1  62.1    56  1153 4.78 4.75 2.96
##       fprice fcarat
## 9068      10     11
## 47377      4      6
## 3178       7      8
## 45482      4      6
## 40631      3      5

2.2 Exploring the data

2.2.1 Price distribution in the dataset

# Histogram of price distribution
qplot(fprice, data=dataset, geom="histogram")

[Figure: histogram of counts by fprice bucket]

# Histogram of carat distribution
qplot(fcarat, data=dataset, geom="histogram")

[Figure: histogram of counts by fcarat bucket]
# Association of price with carat and clarity
g <- ggplot(dataset, aes(y = fprice, x = fcarat))
g <- g + geom_point(aes(color=clarity), position="jitter")
g <- g + geom_smooth(method=loess, col="blue", lwd=1)
g <- g + theme(legend.position="bottom")
g

## geom_smooth: Only one unique x value each group.Maybe you want aes(group = 1)?
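The warning arises because fcarat is a factor: within each factor level there is only one x value, so geom_smooth cannot fit a curve per level. One possible fix, as the warning itself suggests (not applied in the original), is to smooth across all points as a single group:

# Possible fix: smooth across factor levels with a single group
g <- g + geom_smooth(aes(group = 1), method = "loess", col = "blue", lwd = 1)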

[Figure: jittered scatter of fprice against fcarat, points coloured by clarity (legend: I1, SI2, SI1, VS2, VS1, VVS2, VVS1)]

2.3 Model development

# Tidy up the dataset used for development of the model
dataset.pr <- select(dataset, fcarat, cut:table, x:z, fprice, price)
dataset    <- select(dataset, fcarat, cut:table, x:z, fprice)

2.3.0.1 Splitting data into Training and Testing

# Split the data into training and testing datasets:
# 70% in the training dataset and 30% in the testing dataset
inTrain <- createDataPartition(y=dataset$fprice, p=0.7, list=FALSE)
training <- dataset[inTrain,]
testing <- dataset[-inTrain,]
dim(training); dim(testing)

## [1] 3504   10
## [1] 1496   10
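Since createDataPartition stratifies on the outcome, the fprice class proportions should be nearly identical in the two splits. A quick optional check (not in the original):

# Compare class proportions between the training and testing splits
round(prop.table(table(training$fprice)), 3)
round(prop.table(table(testing$fprice)), 3)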

2.4 Developing Predictive Models (Decision Tree)

2.4.1 Model definition

modFit <- train(fprice ~ ., method = "rpart", data = training)
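By default, caret tunes the rpart complexity parameter cp by resampling. A quick way to inspect the result (an optional check, not shown in the original):

# Inspect the tuned complexity parameter and the final tree
modFit$bestTune
print(modFit$finalModel)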

2.4.2 Plotting the classification tree, the fancy style

library(rattle); library(rpart.plot)
fancyRpartPlot(modFit$finalModel)

[Figure: fancyRpartPlot of the final rpart model: root split x < 5.5; the left leaf predicts class 3 (41% of data); the right branch splits on y < 6.1 into leaves predicting class 6 (31%) and class 9 (28%)]

2.4.3 Model validation

2.4.3.1 Training set accuracy (In-Sample)

pred.train <- predict(modFit, training)
print(confusionMatrix(pred.train, training$fprice))

## Confusion Matrix and Statistics
##
##           Reference
## Prediction   2   3   4   5   6   7   8   9  10
##         2    0   0   0   0   0   0   0   0   0
##         3    5 763 500 141  31   9   3   1   1
##         4    0   0   0   0   0   0   0   0   0
##         5    0   0   0   0   0   0   0   0   0
##         6    0  11  97 306 323 200  87  34  14
##         7    0   0   0   0   0   0   0   0   0
##         8    0   0   0   0   0   0   0   0   0
##         9    0   0   2  13  29 104 196 324 310
##         10   0   0   0   0   0   0   0   0   0
##
## Overall Statistics
##
##                Accuracy : 0.4024
##                  95% CI : (0.3861, 0.4189)
##     No Information Rate : 0.2209
##     P-Value [Acc > NIR] : < 2.2e-16
##
##                   Kappa : 0.2939
##  Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
##                      Class: 2 Class: 3 Class: 4 Class: 5 Class: 6 Class: 7
## Sensitivity          0.000000   0.9858   0.0000   0.0000  0.84334  0.00000
## Specificity          1.000000   0.7469   1.0000   1.0000  0.76001  1.00000
## Pos Pred Value            NaN   0.5248      NaN      NaN  0.30131      NaN
## Neg Pred Value       0.998573   0.9946   0.8291   0.8687  0.97533  0.91067
## Prevalence           0.001427   0.2209   0.1709   0.1313  0.10930  0.08933
## Detection Rate       0.000000   0.2178   0.0000   0.0000  0.09218  0.00000
## Detection Prevalence 0.000000   0.4150   0.0000   0.0000  0.30594  0.00000
## Balanced Accuracy    0.500000   0.8663   0.5000   0.5000  0.80168  0.50000
##                      Class: 8 Class: 9 Class: 10
## Sensitivity           0.00000  0.90251   0.00000
## Specificity           1.00000  0.79205   1.00000
## Pos Pred Value            NaN  0.33129       NaN
## Neg Pred Value        0.91838  0.98614   0.90725
## Prevalence            0.08162  0.10245   0.09275
## Detection Rate        0.00000  0.09247   0.00000
## Detection Prevalence  0.00000  0.27911   0.00000
## Balanced Accuracy     0.50000  0.84728   0.50000

Because the fitted tree has only three leaves, only classes 3, 6 and 9 are ever predicted; every other row of the confusion matrix is zero, which caps the in-sample accuracy at about 0.40.

2.4.3.2 Validation set accuracy (Out-of-Sample)

pred.test <- predict(modFit, testing)
print(confusionMatrix(pred.test, testing$fprice))

## Confusion Matrix and Statistics
##
##           Reference
## Prediction   2   3   4   5   6   7   8   9  10
##         2    0   0   0   0   0   0   0   0   0
##         3    2 323 215  66  15   8   0   1   0
##         4    0   0   0   0   0   0   0   0   0
##         5    0   0   0   0   0   0   0   0   0
##         6    0   8  41 125 134  80  28  12   6
##         7    0   0   0   0   0   0   0   0   0
##         8    0   0   0   0   0   0   0   0   0
##         9    0   0   0   5  15  46  94 140 132
##         10   0   0   0   0   0   0   0   0   0
##
## Overall Statistics
##
##                Accuracy : 0.3991
##                  95% CI : (0.3741, 0.4244)
##     No Information Rate : 0.2213
##     P-Value [Acc > NIR] : < 2.2e-16
##
##                   Kappa : 0.2892
##  Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
##                      Class: 2 Class: 3 Class: 4 Class: 5 Class: 6 Class: 7
## Sensitivity          0.000000   0.9758   0.0000    0.000  0.81707  0.00000
## Specificity          1.000000   0.7365   1.0000    1.000  0.77477  1.00000
## Pos Pred Value            NaN   0.5127      NaN      NaN  0.30876      NaN
## Neg Pred Value       0.998663   0.9908   0.8289    0.869  0.97175  0.91043
## Prevalence           0.001337   0.2213   0.1711    0.131  0.10963  0.08957
## Detection Rate       0.000000   0.2159   0.0000    0.000  0.08957  0.00000
## Detection Prevalence 0.000000   0.4211   0.0000    0.000  0.29011  0.00000
## Balanced Accuracy    0.500000   0.8562   0.5000    0.500  0.79592  0.50000
##                      Class: 8 Class: 9 Class: 10
## Sensitivity           0.00000  0.91503   0.00000
## Specificity           1.00000  0.78258   1.00000
## Pos Pred Value            NaN  0.32407       NaN
## Neg Pred Value        0.91845  0.98778   0.90775
## Prevalence            0.08155  0.10227   0.09225
## Detection Rate        0.00000  0.09358   0.00000
## Detection Prevalence  0.00000  0.28877   0.00000
## Balanced Accuracy     0.50000  0.84880   0.50000

2.5 Developing Predictive Models (Randomforest)

2.5.1 Model definition

modFit <- randomForest(fprice ~ ., data=training,
                       importance = TRUE, ntree = 10)
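Since the model is fitted with importance = TRUE, per-class variable importance measures are available. A quick optional look (not part of the original output):

# Inspect which predictors drive the classification
importance(modFit)    # importance matrix
varImpPlot(modFit)    # dotchart of the importance measures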

2.5.2 Model validation

2.5.2.1 Training set accuracy (In-Sample)

pred.train <- predict(modFit, training)
print(confusionMatrix(pred.train, training$fprice))

## Confusion Matrix and Statistics
##
##           Reference
## Prediction   2   3   4   5   6   7   8   9  10
##         2    5   0   0   0   0   0   0   0   0
##         3    0 774   1   0   0   0   0   0   0
##         4    0   0 598   0   0   0   0   0   0
##         5    0   0   0 460   0   0   0   0   0
##         6    0   0   0   0 382   0   0   0   0
##         7    0   0   0   0   1 313   1   0   0
##         8    0   0   0   0   0   0 285   0   0
##         9    0   0   0   0   0   0   0 359   0
##         10   0   0   0   0   0   0   0   0 325
##
## Overall Statistics
##
##                Accuracy : 0.9991
##                  95% CI : (0.9975, 0.9998)
##     No Information Rate : 0.2209
##     P-Value [Acc > NIR] : < 2.2e-16
##
##                   Kappa : 0.999
##  Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
##                      Class: 2 Class: 3 Class: 4 Class: 5 Class: 6 Class: 7
## Sensitivity          1.000000   1.0000   0.9983   1.0000   0.9974  1.00000
## Specificity          1.000000   0.9996   1.0000   1.0000   1.0000  0.99937
## Pos Pred Value       1.000000   0.9987   1.0000   1.0000   1.0000  0.99365
## Neg Pred Value       1.000000   1.0000   0.9997   1.0000   0.9997  1.00000
## Prevalence           0.001427   0.2209   0.1709   0.1313   0.1093  0.08933
## Detection Rate       0.001427   0.2209   0.1707   0.1313   0.1090  0.08933
## Detection Prevalence 0.001427   0.2212   0.1707   0.1313   0.1090  0.08990
## Balanced Accuracy    1.000000   0.9998   0.9992   1.0000   0.9987  0.99969
##                      Class: 8 Class: 9 Class: 10
## Sensitivity           0.99650   1.0000   1.00000
## Specificity           1.00000   1.0000   1.00000
## Pos Pred Value        1.00000   1.0000   1.00000
## Neg Pred Value        0.99969   1.0000   1.00000
## Prevalence            0.08162   0.1025   0.09275
## Detection Rate        0.08134   0.1025   0.09275
## Detection Prevalence  0.08134   0.1025   0.09275
## Balanced Accuracy     0.99825   1.0000   1.00000

2.5.2.2 Validation set accuracy (Out-of-Sample)

pred.test <- predict(modFit, testing)
print(confusionMatrix(pred.test, testing$fprice))

## Confusion Matrix and Statistics
##
##           Reference
## Prediction   2   3   4   5   6   7   8   9  10
##         2    0   0   0   0   0   0   0   0   0
##         3    2 311  20   1   0   0   0   0   0
##         4    0  16 208  35   0   0   0   0   0
##         5    0   4  26 120  40   7   0   1   0
##         6    0   0   2  34  85  36   6   1   1
##         7    0   0   0   3  34  57  19   8   0
##         8    0   0   0   3   4  22  46  15   7
##         9    0   0   0   0   0  10  45  80  42
##         10   0   0   0   0   1   2   6  48  88
##
## Overall Statistics
##
##                Accuracy : 0.6651
##                  95% CI : (0.6406, 0.689)
##     No Information Rate : 0.2213
##     P-Value [Acc > NIR] : < 2.2e-16
##
##                   Kappa : 0.6097
##  Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
##                      Class: 2 Class: 3 Class: 4 Class: 5 Class: 6 Class: 7
## Sensitivity          0.000000   0.9396   0.8125  0.61224  0.51829  0.42537
## Specificity          1.000000   0.9803   0.9589  0.94000  0.93994  0.95301
## Pos Pred Value            NaN   0.9311   0.8031  0.60606  0.51515  0.47107
## Neg Pred Value       0.998663   0.9828   0.9612  0.94145  0.94065  0.94400
## Prevalence           0.001337   0.2213   0.1711  0.13102  0.10963  0.08957
## Detection Rate       0.000000   0.2079   0.1390  0.08021  0.05682  0.03810
## Detection Prevalence 0.000000   0.2233   0.1731  0.13235  0.11029  0.08088
## Balanced Accuracy    0.500000   0.9599   0.8857  0.77612  0.72912  0.68919
##                      Class: 8 Class: 9 Class: 10
## Sensitivity           0.37705  0.52288   0.63768
## Specificity           0.96288  0.92777   0.95803
## Pos Pred Value        0.47423  0.45198   0.60690
## Neg Pred Value        0.94568  0.94466   0.96299
## Prevalence            0.08155  0.10227   0.09225
## Detection Rate        0.03075  0.05348   0.05882
## Detection Prevalence  0.06484  0.11832   0.09693
## Balanced Accuracy     0.66997  0.72532   0.79785

The out-of-sample accuracy (0.6651) is well below the in-sample accuracy (0.9991), a typical sign of a random forest fitting its training data very closely; most misclassifications fall in adjacent buckets.

2.6 Predicting the price in the Test dataset

2.6.1 Determine the fitted model using price range as one of the predictors

# Fitted model
fitted.model <- lm(price ~ fcarat + cut + clarity + color + table + y + z + fprice,
                   data = dataset.pr)

# Summary of the fitted model
summary(fitted.model)
## Call:
## lm(formula = price ~ fcarat + cut + clarity + color + table +
##     y + z + fprice, data = dataset.pr)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -478.00  -91.32   -0.86   90.29  653.91
##
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)   692.7184   121.8809   5.684 1.39e-08 ***
## fcarat.L       63.6436    83.7364   0.760 0.447263
## fcarat.Q     -207.6135    64.4114  -3.223 0.001276 **
## fcarat.C      -59.1854    59.6963  -0.991 0.321518
## fcarat^4       22.1916    53.7768   0.413 0.679872
## fcarat^5      -35.8988    45.3522  -0.792 0.428657
## fcarat^6       -2.3563    36.5826  -0.064 0.948646
## fcarat^7      -25.9610    31.6190  -0.821 0.411654
## fcarat^8       -1.1780    29.5570  -0.040 0.968211
## fcarat^9      -13.4611    25.6953  -0.524 0.600390
## fcarat^10     -15.2370    19.4604  -0.783 0.433680
## fcarat^11     -29.6520    13.2595  -2.236 0.025378 *
## fcarat^12     -17.7874     8.5098  -2.090 0.036649 *
## fcarat^13     -10.1020     6.2293  -1.622 0.104931
## cut.L          24.6540     7.0539   3.495 0.000478 ***
## cut.Q           2.7526     5.7337   0.480 0.631200
## cut.C           5.7071     5.1496   1.108 0.267803
## cut^4          -5.7487     4.3535  -1.320 0.186742
## clarity.L     370.0914    14.4251  25.656  < 2e-16 ***
## clarity.Q     -79.9797    10.2724  -7.786 8.37e-15 ***
## clarity.C      57.5250     8.2458   6.976 3.43e-12 ***
## clarity^4     -36.7786     6.6591  -5.523 3.50e-08 ***
## clarity^5       1.7895     5.5359   0.323 0.746521
## clarity^6      17.2801     5.0240   3.439 0.000588 ***
## clarity^7      14.7822     4.6156   3.203 0.001370 **
## color.L      -172.2569     8.0063 -21.515  < 2e-16 ***
## color.Q       -27.3245     5.8839  -4.644 3.51e-06 ***
## color.C       -13.3488     5.3907  -2.476 0.013309 *
## color^4         4.9552     5.0367   0.984 0.325251
## color^5        -3.7985     4.6906  -0.810 0.418089
## color^6         6.2334     4.2783   1.457 0.145190
## table          -0.9325     0.9375  -0.995 0.319938
## y             294.9346    16.8682  17.485  < 2e-16 ***
## z             119.8985    17.1153   7.005 2.79e-12 ***
## fprice.L     3103.1795    31.1688  99.560  < 2e-16 ***
## fprice.Q      238.9273    26.0004   9.189  < 2e-16 ***
## fprice.C     -126.1668    21.6234  -5.835 5.73e-09 ***
## fprice^4       45.2983    15.5962   2.904 0.003695 **
## fprice^5      -53.8780    10.0269  -5.373 8.08e-08 ***
## fprice^6       40.1292     6.7176   5.974 2.48e-09 ***
## fprice^7      -14.4412     5.6136  -2.573 0.010125 *
## fprice^8        3.7922     5.3327   0.711 0.477048
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 122.7 on 4958 degrees of freedom
## Multiple R-squared:  0.9897, Adjusted R-squared:  0.9897
## F-statistic: 1.167e+04 on 41 and 4958 DF,  p-value: < 2.2e-16
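The ordered factors enter the linear model as orthogonal polynomial contrasts, which is why the coefficients are labelled .L (linear), .Q (quadratic), .C (cubic), ^4 and so on, rather than one coefficient per level. To see the contrast matrix R uses (an optional check, not in the original):

# Polynomial contrasts for an ordered factor
head(contrasts(dataset.pr$fprice))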

2.6.2 Determine the price range

# Predicting the price range
dataset.predict <- dataset.pr
pred.fprice <- predict(modFit, dataset.predict)
dataset.predict <- data.frame(dataset.predict, pred.fprice)

# Checking accuracy
table(dataset.predict$fprice, dataset.predict$pred.fprice)
##
##        2    3   4   5   6   7   8   9  10
##   2    5    2   0   0   0   0   0   0   0
##   3    0 1085  16   4   0   0   0   0   0
##   4    0   21 806  26   2   0   0   0   0
##   5    0    1  35 580  34   3   3   0   0
##   6    0    0   0  40 466  36   4   0   1
##   7    0    0   0   7  36 370  22  10   2
##   8    0    0   0   0   6  20 331  45   6
##   9    0    0   0   1   1   8  15 439  48
##   10   0    0   0   0   1   0   7  42 413
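Note that this table covers the full 5000-row dataset, so it mixes training rows (which the random forest fits almost perfectly) with testing rows; the diagonal therefore overstates out-of-sample accuracy. A quick way to summarise it (an optional check, not in the original):

# Overall bucket accuracy over the full dataset
mean(dataset.predict$fprice == dataset.predict$pred.fprice)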

2.6.3 Determine the price

# Predicting the price
pred.price <- predict(fitted.model, dataset.predict)
pred.price <- round(pred.price, 0)
dataset.predict <- data.frame(dataset.predict, pred.price)
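Note that the formula in fitted.model refers to fprice, so the prediction above uses the actual price bucket stored in dataset.predict. For a genuinely two-step prediction on unseen data, one would substitute the predicted bucket first; a hedged sketch (newdata and pred.price.2step are illustrative names, and this is not done in the original):

# Hypothetical two-step prediction: feed the model-predicted bucket into the lm
newdata <- dataset.predict
newdata$fprice <- newdata$pred.fprice
pred.price.2step <- round(predict(fitted.model, newdata), 0)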

2.7 Analysing the Residuals

2.7.1 Checking unidentified patterns in the Residuals

x <- dataset.pr$price
y <- resid(fitted.model)
ggplot(data.frame(x, y), aes(x, y)) +
  geom_hline(yintercept=0, size=1) +
  geom_point(size=3, colour="black", alpha = 0.1) +
  geom_point(size=2, colour="salmon", alpha = 0.2) +
  xlab("Fitted value") +
  ylab("Residual") +
  geom_smooth(method="loess", colour="red", lwd=1)

[Figure: Residual vs Fitted value scatter with a loess smoother; residuals lie between roughly -500 and +500 across the price range]
2.7.2 Plotting the predicted data with actual data

g <- ggplot(dataset.predict, aes(y = price, x = pred.price))
g <- g + geom_point(size=3, colour="black", alpha = 0.1)
g <- g + geom_point(size=2, colour="salmon", alpha = 0.2)
g <- g + ylab("Actual Price")
g <- g + xlab("Predicted Price")
g <- g + geom_smooth(method=loess, col="blue", lwd=1)
g <- g + geom_smooth(method=lm, col="red", lwd=1)
g

[Figure: Actual Price vs Predicted Price scatter (1000 to 5000 on both axes) with loess (blue) and linear (red) smoothers]
2.7.3 Plotting the difference between actuals and the prediction

# Determine the difference between prediction and actuals
x <- dataset.predict$pred.price - dataset.predict$price

# Summary of x
summary(x)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## -654.0000  -90.0000    1.0000    0.0002   91.0000  478.0000

The mean difference is essentially zero, as expected for least-squares residuals; the rounding of pred.price accounts for the tiny offset.

# Plotting the histogram
myhist <- hist(x, breaks=10, density=10, col="darkgrey",
               xlab="Price difference",
               main="Difference between actuals and prediction")

# Adding a vertical line for the mean
abline(v=mean(x), col="darkgreen", lwd=2)

# Plotting the density curve
multiplier <- myhist$counts / myhist$density
mydensity <- density(x)
mydensity$y <- mydensity$y * multiplier[1]
lines(mydensity, col="blue", lwd=2)

# Plotting the normal curve with the same mean and standard deviation
xfit <- seq(min(x), max(x), length=40)
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
yfit <- yfit * diff(myhist$mids[1:2]) * length(x)
lines(xfit, yfit, col="red", lwd=2)

# Add legend
legend('topright', c("Mean", "Density Curve", "Normal Curve"),
       lty=c(1,1,1), lwd=c(2,2,2), col = c("darkgreen", "blue", "red"))

[Figure: histogram "Difference between actuals and prediction" with mean line, density curve and normal curve overlaid]
2.7.4 Plotting the difference between actuals and the prediction (in percentage)

# Determine the difference between prediction and actuals
x <- (dataset.predict$pred.price - dataset.predict$price) / dataset.predict$price
x <- round(x*100, 2)

# Summary of percentage variance
summary(x)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
## -28.3300  -3.5000   0.0450   0.3333   3.9100  26.2500

# Plotting the histogram
myhist <- hist(x, breaks=10, density=10, col="darkgrey",
               xlab="Price difference",
               main="Difference between actuals and prediction")

# Adding a vertical line for the mean
abline(v=mean(x), col="darkgreen", lwd=2)

# Plotting the density curve
multiplier <- myhist$counts / myhist$density
mydensity <- density(x)
mydensity$y <- mydensity$y * multiplier[1]
lines(mydensity, col="blue", lwd=2)

# Plotting the normal curve with the same mean and standard deviation
xfit <- seq(min(x), max(x), length=40)
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
yfit <- yfit * diff(myhist$mids[1:2]) * length(x)
lines(xfit, yfit, col="red", lwd=2)

# Add legend
legend('topright', c("Mean", "Density Curve", "Normal Curve"),
       lty=c(1,1,1), lwd=c(2,2,2), col = c("darkgreen", "blue", "red"))

[Figure: histogram of percentage price differences (roughly -30% to +30%) with mean line, density curve and normal curve overlaid]

2.7.5 Determine model RMSE

model.rmse <- sqrt(mean(residuals(fitted.model)^2))
model.rmse

## [1] 122.1546
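For context, the RMSE can be expressed relative to the mean price in the modelling dataset (2635, per the earlier summary), i.e. roughly 4.6%. A quick optional computation (not in the original):

# RMSE as a fraction of the mean price
model.rmse / mean(dataset.pr$price)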
