Documente Academic
Documente Profesional
Documente Cultură
Prostate Project
Part 1. Describe your data and the purpose of your analysis
Prostate cancer: Stamey et al. (1989) studied potential predictors of prostate-
specific antigen (PSA) in patients. The response is lpsa (log of PSA). The independent variables are X, lcavol (log of cavol),
lweight (log of weight), age, lbph (log of bph), svi, lcp (log of cp), gleason, and pgg45.
> dim(prostate)
[1] 97 10
> names(prostate)
> prostate[1:5,]
lpsa
1 -0.4307829
2 -0.1625189
3 -0.1625189
4 -0.1625189
5 0.3715636
1
Project 1
> pairs(lpsa~.,data=prostate)
> boxplot(prostate)
2
Project 1
> prostate.train<-
sample(1:nrow(prostate),round(nrow(prostate)/2),replace=FALSE)
> index.train<-sample(1:nrow(prostate),round(nrow(prostate)/2),replace=FALSE)
> prostate.train<-prostate[index.train,]
> prostate.validation<-prostate[-index.train,]
> model1<-
lm(lpsa~1+lcavol+lweight+age+lbph+svi+lcp+gleason+pgg45,data=prostate.train)
> summary(model1)
Call:
Residuals:
Coefficients:
3
Project 1
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Best Subsets
> leaps<-
regsubsets(lpsa~1+lcavol+lweight+age+lbph+svi+lcp+gleason+pgg45,data=prostate.train,nbest=1)
> names(leaps)
[1] "np" "nrbar" "d" "rbar" "thetab" "first" "last" "vorder" "tol"
"rss" "bound"
> summary(leaps)
4
Project 1
2 ( 1 ) "*" " " " " " " "*" " " " " ""
3 ( 1 ) "*" "*" " " " " "*" " " " " ""
4 ( 1 ) "*" "*" "*" " " "*" " " " " ""
5 ( 1 ) "*" "*" "*" "*" "*" " " " " ""
> leaps$rss
> plot(leaps$rss)
5
Project 1
> coef(leaps,6)
> library(car)
> subsets(leaps,statistic="rss")
6
Project 1
> subsets(leaps,statistic="cp")
7
Project 1
> subsets(leaps,statistic="bic")
8
Project 1
9
Project 1
> model2<-lm(lpsa~1+lcavol+lweight+age+svi,data=prostate.train)
> summary(model2)
Call:
Residuals:
Coefficients:
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Stepwise Regression
> model<-
lm(lpsa~1+lcavol+lweight+age+lbph+svi+lcp+gleason+pgg45,data=prostate.train)
> steps<-stepAIC(model,direction="both")
Start: AIC=-26.05
10
Project 1
pgg45
Step: AIC=-27.97
11
Project 1
Step: AIC=-29.64
Step: AIC=-31.34
12
Project 1
> model3<-lm(lpsa~1+lcavol+lweight+age+lbph+svi,data=prostate.train)
> summary(model3)
Call:
data = prostate.train)
Residuals:
Coefficients:
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
13
Project 1
Ridge Regression
> model.ridge<-
lm.ridge(lpsa~1+lcavol+lweight+age+lbph+svi+lcp+gleason+pgg45,data=prostate.train,na.action="na.omit",lambda=seq(1,100,.02))
> plot(model.ridge$lambda,model.ridge$GCV)
> model.ridge$lambda[model.ridge$GCV==min(model.ridge$GCV)]
[1] 4.44
> model.ridge<-
lm.ridge(lpsa~1+lcavol+lweight+age+lbph+svi+lcp+gleason+pgg45,data=prostate.train,na.action="na.omit",lambda=4.44)
14
Project 1
> model.ridge
> model.ridge2<-
lm(lpsa~1+lcavol+lweight+age+lbph+svi+lcp+gleason,data=prostate.train)
> model.ridge2
Call:
lm(formula = lpsa ~ 1 + lcavol + lweight + age + lbph + svi + lcp + gleason, data =
prostate.train)
Coefficients:
> summary(model.ridge2)
Call:
Residuals:
Coefficients:
15
Project 1
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> y.validation<-data.frame(prostate.train[,10])
> predict.model1<-predict(model1,x.validation)
> ls()
16
Project 1
> predict.model2<-predict(model2,x.validation)
> predict.model3<-predict(model3,x.validation)
> predict.model4<-predict(model.ridge2,x.validation)
> rss.model1<-sum((y.validation-predict.model1)^2)
> rss.model2<-sum((y.validation-predict.model2)^2)
> rss.model3<-sum((y.validation-predict.model3)^2)
> rss.model4<-sum((y.validation-predict.model4)^2)
> rss.model1
[1] 19.17387
> rss.model2
[1] 20.51985
> rss.model3
[1] 19.45737
> rss.model4
[1] 19.20584
Call:
Residuals:
17
Project 1
Coefficients:
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
18