R Source

by 정보장교 2018. 6. 12.

library(ElemStatLearn) #contains the data

library(car) #package to calculate Variance Inflation Factor

library(corrplot) #correlation plots

library(leaps) #best subsets regression

library(glmnet) #allows ridge regression, LASSO and elastic net

library(caret) #this will help identify the appropriate parameters
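A note for anyone rerunning this today: ElemStatLearn has since been archived on CRAN, so library(ElemStatLearn) may fail on a current R installation. One workaround is installing the archived source tarball directly; the exact version number below is an assumption, so check the CRAN archive listing first:

install.packages("https://cran.r-project.org/src/contrib/Archive/ElemStatLearn/ElemStatLearn_2015.6.26.tar.gz", repos=NULL, type="source") #adjust the version to match the archive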

 

data(prostate)

str(prostate)

plot(prostate)

plot(prostate$gleason)

table(prostate$gleason)

boxplot(prostate$lpsa~prostate$gleason, xlab="Gleason Score", ylab="Log of PSA")

 

prostate$gleason = ifelse(prostate$gleason == 6, 0, 1) #recode gleason as binary: 6 -> 0, 7 or higher -> 1

table(prostate$gleason)

 

p.cor = cor(prostate) #pairwise correlation matrix

corrplot.mixed(p.cor) #numbers on the lower triangle, circles on the upper

 

train = subset(prostate, train==TRUE)[,1:9] #training rows; drop the train indicator column

str(train)

test = subset(prostate, train==FALSE)[,1:9] #held-out rows

str(test)
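A quick sanity check on the split sizes (str() above should agree): the prostate data ships with a train indicator that flags 67 training and 30 test rows.

nrow(train) #67 training observations

nrow(test) #30 held-out observations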

 

subfit = regsubsets(lpsa~., data=train) #exhaustive best subsets search

b.sum = summary(subfit)

which.min(b.sum$bic) #model size with the lowest BIC

plot(b.sum$bic, type="l", xlab="# of Features", ylab="BIC", main="BIC score by Feature Inclusion")

plot(subfit, scale="bic", main="Best Subset Features")
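Assuming which.min(b.sum$bic) returned 3 (the lm() refit below suggests it did), the coefficient estimates for that BIC-optimal three-variable model can be pulled straight from the regsubsets object:

coef(subfit, id=3) #coefficients of the best three-feature model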

 

ols = lm(lpsa~lcavol+lweight+gleason, data=train) #refit the BIC-best three-feature model with lm()

plot(ols$fitted.values, train$lpsa, xlab="Predicted", ylab="Actual", main="Predicted vs Actual")
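The car package was loaded above for variance inflation factors but never called; as a minimal collinearity check on the chosen model, one line suffices:

vif(ols) #values well above 5 would suggest problematic collinearity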

#evaluate the best subsets model on the held-out test set

pred.subfit = predict(ols, newdata=test)

plot(pred.subfit, test$lpsa , xlab="Predicted", ylab="Actual", main="Predicted vs Actual")

resid.subfit = test$lpsa - pred.subfit

mean(resid.subfit^2)

 

x = as.matrix(train[,1:8]) #glmnet requires a numeric predictor matrix

y = train[,9] #response: lpsa

ridge = glmnet(x, y, family="gaussian", alpha=0) #alpha=0 fits ridge regression

print(ridge)

plot(ridge, label=TRUE)

plot(ridge, xvar="lambda", label=TRUE)

ridge.coef = coef(ridge, s=0.1, exact=TRUE, x=x, y=y) #newer glmnet versions require x and y when exact=TRUE

ridge.coef

plot(ridge, xvar="dev", label=TRUE)

 

newx = as.matrix(test[,1:8])

ridge.y = predict(ridge, newx=newx, type="response", s=0.1)

plot(ridge.y, test$lpsa, xlab="Predicted", ylab="Actual", main="Ridge Regression")

ridge.resid = ridge.y - test$lpsa

mean(ridge.resid^2)
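The penalty s=0.1 above is picked by hand. A cross-validated alternative, a sketch using cv.glmnet with alpha=0 (the seed value is arbitrary), would be:

set.seed(317)

ridge.cv = cv.glmnet(x, y, alpha=0) #10-fold CV along the ridge path

ridge.cv$lambda.min #penalty minimizing mean cross-validated error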

 

lasso = glmnet(x, y, family="gaussian", alpha=1) #alpha=1 fits the LASSO

print(lasso)

plot(lasso, xvar="lambda", label=TRUE)

lasso.coef = coef(lasso, s=0.045, exact=TRUE, x=x, y=y) #x and y again required with exact=TRUE

lasso.coef
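One appeal of the LASSO is sparsity; dropping the intercept row from the sparse coefficient matrix gives a quick count of how many slopes survive at s=0.045:

sum(lasso.coef[-1,] != 0) #number of nonzero coefficients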

lasso.y = predict(lasso, newx=newx, type="response", s=0.045)

plot(lasso.y, test$lpsa, xlab="Predicted", ylab="Actual", main="LASSO")

lasso.resid = lasso.y - test$lpsa

mean(lasso.resid^2)

 

grid = expand.grid(.alpha=seq(0,1, by=.2), .lambda=seq(0.00,0.2, by=0.02)) #6 alpha values x 11 lambda values = 66 combinations

table(grid) #cross-tab confirms one row per alpha/lambda pair

head(grid)

control = trainControl(method="LOOCV") #selectionFunction="best"

set.seed(701) #our random seed

enet.train = train(lpsa~., data=train, method="glmnet", trControl=control, tuneGrid=grid)

enet.train
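The alpha=0, lambda=0.08 used in the refit below presumably comes from this grid search; caret stores the winning pair directly:

enet.train$bestTune #alpha/lambda pair with the lowest LOOCV RMSE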

 

enet = glmnet(x, y, family="gaussian", alpha=0, lambda=.08) #refit at the grid search winner: alpha=0, lambda=0.08

enet.coef = coef(enet, s=.08, exact=TRUE, x=x, y=y) #x and y required with exact=TRUE

enet.coef

enet.y = predict(enet, newx=newx, type="response", s=.08)

plot(enet.y, test$lpsa, xlab="Predicted", ylab="Actual", main="Elastic Net")

enet.resid = enet.y - test$lpsa

mean(enet.resid^2)

 

set.seed(317)

lasso.cv = cv.glmnet(x, y) #10-fold CV; alpha defaults to 1 (LASSO)

plot(lasso.cv)

lasso.cv$lambda.min #minimum

lasso.cv$lambda.1se #one standard error away

coef(lasso.cv, s ="lambda.1se")

lasso.y.cv = predict(lasso.cv, newx=newx, type="response", s="lambda.1se")

lasso.cv.resid = lasso.y.cv - test$lpsa

mean(lasso.cv.resid^2)
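Finally, a compact side-by-side of the test MSEs computed above (the labels are just illustrative names):

mse = c(subsets=mean(resid.subfit^2), ridge=mean(ridge.resid^2), lasso=mean(lasso.resid^2), enet=mean(enet.resid^2), lasso.cv=mean(lasso.cv.resid^2))

sort(mse) #smallest test error first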

