In this lab, we will cover some state-of-the-art techniques in the framework of tree models. We use the same datasets as in the previous lab: the Boston Housing data and the (Taiwan) credit card default data (a subsample of n = 12,000 rows).
# load Boston data
library(MASS)
data(Boston)
index <- sample(nrow(Boston),nrow(Boston)*0.90)
boston_train <- Boston[index,]
boston_test <- Boston[-index,]
# load credit card data
credit_data <- read.csv(file = "https://yanyudm.github.io/Data-Mining-R/lecture/data/credit_default.csv", header=T)
# convert categorical variables
credit_data$SEX<- as.factor(credit_data$SEX)
credit_data$EDUCATION<- as.factor(credit_data$EDUCATION)
credit_data$MARRIAGE<- as.factor(credit_data$MARRIAGE)
# random splitting
index <- sample(nrow(credit_data),nrow(credit_data)*0.60)
credit_train <- credit_data[index,]
credit_test <- credit_data[-index,]
Boosting builds a number of small trees sequentially; each tree is fit to the residuals left by the previous trees. We use the gbm package to build boosted regression trees. Note: the current gbm package does not handle asymmetric loss.
library(gbm)
# ?gbm
boston_boost<- gbm(formula = medv~.,
data = boston_train,
distribution = "gaussian",
n.trees = 10000,
shrinkage = 0.01,
interaction.depth = 8)
summary(boston_boost)
## var rel.inf
## rm rm 35.7987234
## lstat lstat 35.3423534
## dis dis 7.8091378
## crim crim 4.9641373
## nox nox 3.9843744
## age age 3.7054899
## black black 2.4844822
## ptratio ptratio 2.1772305
## tax tax 1.8567842
## indus indus 0.9341527
## rad rad 0.5753225
## zn zn 0.2427779
## chas chas 0.1250337
Note that we need to specify distribution = "gaussian" when fitting a regression tree; the default is the Bernoulli distribution, which is for binary classification problems. n.trees is the number of small trees we fit, and it needs to be chosen carefully because too large a value may result in overfitting. shrinkage is another tuning parameter that controls how much each tree contributes. interaction.depth is the number of splits allowed in each tree. All of these tuning parameters can be chosen by cross-validation; the idea is that we don't want to overfit.
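For example, gbm has built-in cross-validation through its cv.folds argument, and gbm.perf() then estimates the optimal number of trees from the cross-validated error curve. The following is only a minimal sketch; the object name boston_boost_cv and the particular values of shrinkage, interaction.depth, and cv.folds are illustrative choices.
# fit with 5-fold cross-validation to help choose n.trees
boston_boost_cv <- gbm(formula = medv~.,
data = boston_train,
distribution = "gaussian",
n.trees = 10000,
shrinkage = 0.01,
interaction.depth = 8,
cv.folds = 5)
# estimated optimal number of trees based on the CV error curve
best_ntree <- gbm.perf(boston_boost_cv, method = "cv")
best_ntree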
The fitted boosted tree also gives the marginal relationship between the response and each predictor through partial dependence plots.
par(mfrow=c(1,2))
plot(boston_boost, i="lstat")
plot(boston_boost, i="rm")
Prediction on the testing sample.
boston_boost_pred_test<- predict(boston_boost, boston_test, n.trees = 10000)
mean((boston_test$medv-boston_boost_pred_test)^2)
## [1] 15.27358
We can investigate how the testing error changes with the number of trees.
ntree <- seq(100, 10000, 100)
predmat <- predict(boston_boost, newdata = boston_test, n.trees = ntree)
err <- apply((predmat-boston_test$medv)^2, 2, mean)
plot(ntree, err, type = 'l', col=2, lwd=2, xlab = "n.trees", ylab = "Test MSE")
# test.err holds the random forest test errors computed in the earlier random forest section
abline(h=min(test.err), lty=2)
The horizontal line is the best prediction error from the random forests we obtained earlier.
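If you are running this lab on its own, test.err will not be in your workspace. The following is a minimal sketch of how it might have been computed in the earlier random forest section (assuming the randomForest package and mtry values 1 through 13); run it before the plotting code above, and note that your earlier settings may have differed.
library(randomForest)
test.err <- rep(0, 13)
for (i in 1:13){
  boston_rf <- randomForest(medv~., data = boston_train, mtry = i)
  test.err[i] <- mean((boston_test$medv - predict(boston_rf, boston_test))^2)
}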
We now turn to the credit card default data, a binary classification problem, and fit boosted classification trees with the adabag package (an implementation of the AdaBoost algorithm).
library(adabag)
library(ROCR) # needed below for ROC curves and AUC
# boosting() requires a factor response
credit_train$default.payment.next.month <- as.factor(credit_train$default.payment.next.month)
credit_boost <- boosting(default.payment.next.month~., data = credit_train, boos = TRUE)
save(credit_boost, file = "credit_boost.Rdata")
# Training AUC
pred_credit_boost <- predict(credit_boost, newdata = credit_train)
pred <- prediction(pred_credit_boost$prob[,2], credit_train$default.payment.next.month)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)
# Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))
# Testing AUC
pred_credit_boost <- predict(credit_boost, newdata = credit_test)
pred <- prediction(pred_credit_boost$prob[,2], credit_test$default.payment.next.month)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)
# Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))
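Besides the ROC curve and AUC, the object returned by predict() on an adabag boosting fit also contains the predicted class labels. As a small follow-up sketch (assuming the $class component of that object), we can examine the confusion matrix and misclassification rate on the testing sample:
# confusion matrix and misclassification rate on the testing sample
table(Truth = credit_test$default.payment.next.month, Predicted = pred_credit_boost$class)
mean(pred_credit_boost$class != credit_test$default.payment.next.month)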