There are two common implementations of GAMs in R. The older one (originally written for S-PLUS) is the ‘gam’ package by Hastie and Tibshirani. The newer one, which we use below, is the ‘mgcv’ package by Simon Wood. The basic modeling workflow is similar in both packages (the main function is gam() in each; be wary of having both libraries loaded at the same time, since one masks the other), but the behind-the-scenes computational approaches differ, as do the arguments for smoothing and optimization and the model output. Expect slightly different results even with the same model structure on the same dataset.
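If both packages are attached in the same session, the one loaded last masks the other's gam() and s(). A minimal sketch of keeping the two apart (assuming both packages are installed; mtcars is used purely as a toy dataset here):
library(mgcv)                 # Wood's implementation: penalized regression splines
fit_mgcv <- mgcv::gam(mpg ~ s(hp) + s(wt), data = mtcars)
# To use the Hastie/Tibshirani implementation instead, detach mgcv first so that
# its gam() and s() are not masked:
# detach("package:mgcv", unload = TRUE)
# library(gam)
# fit_HT <- gam(mpg ~ s(hp, df = 4) + s(wt, df = 4), data = mtcars)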
library(MASS)
set.seed(1234)
# 70/30 random split into training and test sets
sample_index <- sample(nrow(Boston), nrow(Boston)*0.70)
Boston_train <- Boston[sample_index,]
Boston_test <- Boston[-sample_index,]
str(Boston_train)
Model and plots
library(mgcv)
#create gam model
Boston_gam <- gam(medv ~ s(crim)+s(zn)+s(indus)+chas+s(nox)
+s(rm)+s(age)+s(dis)+rad+s(tax)+s(ptratio)
+s(black)+s(lstat),data=Boston_train)
summary(Boston_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) + s(rm) + s(age) +
## s(dis) + rad + s(tax) + s(ptratio) + s(black) + s(lstat)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.9672 1.4638 12.958 <2e-16 ***
## chas 1.0098 0.7104 1.421 0.1562
## rad 0.3454 0.1505 2.296 0.0224 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(crim) 4.543 5.488 7.436 1.29e-06 ***
## s(zn) 1.000 1.000 0.973 0.32465
## s(indus) 6.701 7.629 4.124 0.00018 ***
## s(nox) 8.946 8.995 17.931 < 2e-16 ***
## s(rm) 7.792 8.605 19.944 < 2e-16 ***
## s(age) 2.415 3.058 1.302 0.28297
## s(dis) 8.742 8.975 8.895 < 2e-16 ***
## s(tax) 6.122 7.145 7.091 < 2e-16 ***
## s(ptratio) 1.000 1.000 21.257 6.20e-06 ***
## s(black) 1.000 1.000 0.002 0.96366
## s(lstat) 6.362 7.484 18.551 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.893 Deviance explained = 91%
## GCV = 10.713 Scale est. = 8.9693 n = 354
plot(Boston_gam, pages=1)
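Note from the summary that s(zn), s(ptratio), and s(black) have edf equal to 1, i.e. they are estimated as essentially linear, and that s(zn), s(age), and s(black) are not significant. A possible refinement, sketched below but not part of the original analysis, is to enter the edf-1 terms linearly, or to let mgcv shrink weak smooths automatically with select = TRUE:
# Option 1 (hypothetical refit): enter the essentially linear terms without s()
Boston_gam2 <- gam(medv ~ s(crim) + zn + s(indus) + chas + s(nox)
                   + s(rm) + s(age) + s(dis) + rad + s(tax) + ptratio
                   + black + s(lstat), data = Boston_train)
# Option 2 (hypothetical refit): add a shrinkage penalty so that weak smooths
# can be penalized out of the model entirely
Boston_gam3 <- gam(medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox)
                   + s(rm) + s(age) + s(dis) + rad + s(tax) + s(ptratio)
                   + s(black) + s(lstat), data = Boston_train, select = TRUE)
AIC(Boston_gam2); AIC(Boston_gam3)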
Model AIC/BIC and mean residual deviance
AIC(Boston_gam)
BIC(Boston_gam)
Boston_gam$deviance
In-sample fit performance
# in-sample mean residual deviance: deviance divided by the residual degrees of freedom
Boston_gam.mrd.train <- Boston_gam$deviance/Boston_gam$df.residual
# average squared error: deviance divided by the number of training observations
Boston_gam.mse.train <- Boston_gam$deviance/nrow(Boston_train)
#using the predict() function
pi <- predict(Boston_gam,Boston_train)
mean((pi - Boston_train$medv)^2)
## [1] 7.509325
Out-of-sample performance
pi.out <- predict(Boston_gam,Boston_test)
mean((pi.out - Boston_test$medv)^2)
## [1] 14.04571
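For context, it can be useful to compare these numbers with an ordinary linear model fit on the same split. The baseline below is a quick sketch (Boston_lm is a name introduced here, not part of the original analysis):
# Hypothetical OLS baseline on the same training data
Boston_lm <- lm(medv ~ ., data = Boston_train)
# in-sample and out-of-sample mean squared (prediction) error of the baseline
mean((predict(Boston_lm, Boston_train) - Boston_train$medv)^2)
mean((predict(Boston_lm, Boston_test) - Boston_test$medv)^2)
# AIC comparison with the GAM
AIC(Boston_lm); AIC(Boston_gam)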
The Credit Card Default Data has 10800 observations and 23 predictive variables. The details of the data can be found at http://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients. Think about what kinds of factors might lead people to fail to pay their credit card balance.
credit_data <- read.csv(file = "https://yanyudm.github.io/Data-Mining-R/lecture/data/credit_default.csv", header=T)
# rename
library(dplyr)
credit_data<- rename(credit_data, default=default.payment.next.month)
# convert categorical data to factor
credit_data$SEX <- as.factor(credit_data$SEX)
credit_data$EDUCATION <- as.factor(credit_data$EDUCATION)
credit_data$MARRIAGE <- as.factor(credit_data$MARRIAGE)
Now split the data 90/10 as training/testing datasets:
index <- sample(nrow(credit_data), nrow(credit_data)*0.90)
credit_train <- credit_data[index,]
credit_test <- credit_data[-index,]
Some of these predictors are categorical variables and will enter the gam() model as partially linear (parametric) terms. We only wrap the flexible s() smoother around continuous predictors such as LIMIT_BAL, AGE, etc. Here we demonstrate using five continuous variables as smooth terms and three categorical variables, SEX, EDUCATION, and MARRIAGE, as partially linear terms, to keep the summary output compact.
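As an aside, the s() wrappers can also be added programmatically rather than typed by hand. The sketch below (restricted, as an assumption, to the same five continuous and three categorical predictors used in this example) builds the same formula, while the next chunk simply writes it out explicitly:
# Sketch: assemble "default ~ s(...) + ... + factor terms" from character vectors
cont_vars <- c("LIMIT_BAL", "AGE", "PAY_0", "BILL_AMT1", "PAY_AMT1")
fact_vars <- c("SEX", "EDUCATION", "MARRIAGE")
rhs <- paste(c(paste0("s(", cont_vars, ")"), fact_vars), collapse = " + ")
as.formula(paste("default ~", rhs))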
## Create a formula for a model with a large number of variables:
gam_formula <- as.formula("default~s(LIMIT_BAL)+s(AGE)+s(PAY_0)+s(BILL_AMT1)+s(PAY_AMT1)+SEX+EDUCATION+MARRIAGE")
credit_gam <- gam(formula = gam_formula, family = binomial, data = credit_train)
summary(credit_gam)
##
## Family: binomial
## Link function: logit
##
## Formula:
## default ~ s(LIMIT_BAL) + s(AGE) + s(PAY_0) + s(BILL_AMT1) + s(PAY_AMT1) +
## SEX + EDUCATION + MARRIAGE
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.238725 0.067409 -18.376 < 2e-16 ***
## SEX2 -0.164904 0.053905 -3.059 0.00222 **
## EDUCATION2 -0.055720 0.061951 -0.899 0.36843
## EDUCATION3 -0.203156 0.084394 -2.407 0.01607 *
## EDUCATION4 -1.424943 0.338854 -4.205 2.61e-05 ***
## MARRIAGE2 -0.138104 0.060675 -2.276 0.02284 *
## MARRIAGE3 0.002647 0.233595 0.011 0.99096
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(LIMIT_BAL) 5.727 6.621 145.428 < 2e-16 ***
## s(AGE) 1.022 1.044 2.329 0.12954
## s(PAY_0) 6.848 7.575 1247.069 < 2e-16 ***
## s(BILL_AMT1) 3.706 4.631 48.692 < 2e-16 ***
## s(PAY_AMT1) 4.981 6.140 26.129 0.00024 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.206 Deviance explained = 17.6%
## UBRE = -0.1273 Scale est. = 1 n = 10800
plot(credit_gam, shade=TRUE, seWithMean=TRUE, scale=0, pages = 1)
The function vis.gam() can visualize the nonlinear relationship between two variables and the linear predictor as a 3D surface:
# vis.gam(credit_gam)
vis.gam(credit_gam, view=c("LIMIT_BAL","AGE"), theta= 140) # different view
To see the in-sample fit performance, you can look at the confusion matrix produced by the commands below. We take the cut-off probability to be 1/6: with a 5:1 cost ratio between false negatives and false positives, the cost-minimizing cut-off is 1/(1+5) = 1/6.
pcut_gam <- 1/6
prob_gam_in <-predict(credit_gam,credit_train,type="response")
pred_gam_in <- (prob_gam_in>=pcut_gam)*1
table(credit_train$default, pred_gam_in,dnn=c("Observed","Predicted"))
## Predicted
## Observed 0 1
## 0 5849 2581
## 1 719 1651
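For reference (not part of the original output), the overall in-sample misclassification rate implied by this table can be computed directly:
# proportion of training observations misclassified at the 1/6 cut-off
mean(credit_train$default != pred_gam_in)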
The asymmetric cost with a 5:1 cost ratio is:
creditcost <- function(r, pi){
  weight1 <- 5   # cost of a false negative: actual default (1) predicted as 0
  weight0 <- 1   # cost of a false positive: actual non-default (0) predicted as 1
  pcut <- weight0/(weight0 + weight1)
  c1 <- (r == 1) & (pi < pcut)   # logical vector - TRUE if actual 1 but predicted 0
  c0 <- (r == 0) & (pi > pcut)   # logical vector - TRUE if actual 0 but predicted 1
  return(mean(weight1*c1 + weight0*c0))
}
creditcost(credit_train$default, pred_gam_in)
## [1] 0.5718519
Training model AIC, BIC, and mean residual deviance:
AIC(credit_gam)
## [1] 9425.127
BIC(credit_gam)
## [1] 9638.527
# credit_gam$deviance
ROC Curve:
library(ROCR)
pred <- prediction(predictions = c(prob_gam_in), labels = credit_train$default)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)
#Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.7681283
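The 1/6 cut-off follows from the 5:1 cost ratio, but it can also be chosen empirically. A simple illustrative sketch (not part of the original analysis) searches a grid of candidate cut-offs for the one that minimizes the in-sample asymmetric cost:
# grid search over candidate cut-off probabilities
cut_grid <- seq(0.01, 0.60, by = 0.01)
cost_grid <- sapply(cut_grid, function(p) {
  creditcost(credit_train$default, (prob_gam_in >= p)*1)
})
cut_grid[which.min(cost_grid)]   # cut-off with the smallest in-sample cost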
Out-of-sample performance, using the same 1/6 cut-off:
pcut <- 1/6
prob_gam_out <- predict(credit_gam, credit_test,type="response")
pred_gam_out <- (prob_gam_out>=pcut)*1
table(credit_test$default, pred_gam_out,dnn=c("Observed","Predicted"))
## Predicted
## Observed 0 1
## 0 625 313
## 1 88 174
The out-of-sample asymmetric cost is:
creditcost(credit_test$default, pred_gam_out)
## [1] 0.6275
ROC Curve:
pred <- prediction(predictions = c(prob_gam_out), labels = credit_test$default)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)
#Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.7306678
Finally, we can also apply gam() to univariate smoothing on the motorcycle data.
library(MASS)
data('mcycle')
str(mcycle)
summary(mcycle)
# Rename the variables for ease of usage
Y <- mcycle$accel
X <- mcycle$times
#Scatterplot
plot(Y~X, xlab="time",ylab="Acceleration", main="Scatterplot of Acceleration against Time")
library(mgcv)
s_gam <- gam(Y ~ s(X),data=mcycle)
summary(s_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Y ~ s(X)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -25.546 1.951 -13.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(X) 8.693 8.972 53.52 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.783 Deviance explained = 79.8%
## GCV = 545.78 Scale est. = 506 n = 133
#plot the model
plot(s_gam, residuals = TRUE, pch = 1)
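Note that the estimated edf (8.693) is close to the maximum allowed by the default basis dimension (k - 1 = 9), which can be a sign that the default basis is too small. mgcv's gam.check() reports a basis-dimension diagnostic, and k can be raised and the model refit; a brief sketch:
# diagnostic plots plus a check on whether the basis dimension k is adequate
gam.check(s_gam)
# if the diagnostics suggest k is too small, refit with a larger basis
s_gam_k20 <- gam(Y ~ s(X, k = 20), data = mcycle)
summary(s_gam_k20)
plot(s_gam_k20, residuals = TRUE, pch = 1)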