1 Generalized Additive Model

There are two common implementations of GAMs in R. The older version (originally made for S-PLUS) is available as the ‘gam’ package by Hastie and Tibshirani. The newer version that we will use below is the ‘mgcv’ package from Simon Wood. The basic modeling procedure for both packages is similar (the function is gam for both; be wary of having both libraries loaded at the same time), but the behind-the-scenes computational approaches differ, as do the arguments for optimization and the model output. Expect the results to be slightly different when used with the same model structure on the same dataset.

1.1 GAM on Boston Housing dataset

library(MASS)
set.seed(1234)
sample_index <- sample(nrow(Boston),nrow(Boston)*0.70)
Boston_train <- Boston[sample_index,]
Boston_test <- Boston[-sample_index,]
str(Boston_train)

Model and plots

library(mgcv)

#create gam model
Boston_gam <- gam(medv ~ s(crim)+s(zn)+s(indus)+chas+s(nox)
                 +s(rm)+s(age)+s(dis)+rad+s(tax)+s(ptratio)
                 +s(black)+s(lstat),data=Boston_train)

summary(Boston_gam)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) + s(rm) + s(age) + 
##     s(dis) + rad + s(tax) + s(ptratio) + s(black) + s(lstat)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  18.9672     1.4638  12.958   <2e-16 ***
## chas          1.0098     0.7104   1.421   0.1562    
## rad           0.3454     0.1505   2.296   0.0224 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##              edf Ref.df      F  p-value    
## s(crim)    4.543  5.488  7.436 1.29e-06 ***
## s(zn)      1.000  1.000  0.973  0.32465    
## s(indus)   6.701  7.629  4.124  0.00018 ***
## s(nox)     8.946  8.995 17.931  < 2e-16 ***
## s(rm)      7.792  8.605 19.944  < 2e-16 ***
## s(age)     2.415  3.058  1.302  0.28297    
## s(dis)     8.742  8.975  8.895  < 2e-16 ***
## s(tax)     6.122  7.145  7.091  < 2e-16 ***
## s(ptratio) 1.000  1.000 21.257 6.20e-06 ***
## s(black)   1.000  1.000  0.002  0.96366    
## s(lstat)   6.362  7.484 18.551  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.893   Deviance explained =   91%
## GCV = 10.713  Scale est. = 8.9693    n = 354

plot(Boston_gam, pages=1)

Model AIC/BIC and mean residual deviance

AIC(Boston_gam)
BIC(Boston_gam)
Boston_gam$deviance

In-sample fit performance

#in-sample mse using df 
Boston_gam.mse.train <- Boston_gam$dev/Boston_gam$df.residual 
#Average Sum of Squared Error
Boston_gam.mse.train <- Boston_gam$dev/nrow(Boston_train) 

#using the predict() function
pi <- predict(Boston_gam,Boston_train)
mean((pi - Boston_train$medv)^2)

## [1] 7.509325

out of sample performance

pi.out <- predict(Boston_gam,Boston_test)
mean((pi.out - Boston_test$medv)^2)

## [1] 14.04571

go to top

1.2 GAM on Credit Card Default Data

The Credit Card Default Data has 10800 observations and 23 predictive variables. The details of the data can be found at http://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients. Think about what kind of factors could affect people to fail to pay their credit balance.

credit_data <- read.csv(file = "https://yanyudm.github.io/Data-Mining-R/lecture/data/credit_default.csv", header=T)

# rename
library(dplyr)
credit_data<- rename(credit_data, default=default.payment.next.month)
# convert categorical data to factor
credit_data$SEX <- as.factor(credit_data$SEX)
credit_data$EDUCATION <- as.factor(credit_data$EDUCATION)
credit_data$MARRIAGE <- as.factor(credit_data$MARRIAGE)

Now split the data 90/10 as training/testing datasets:

index <- sample(nrow(credit_data),nrow(credit_data)*0.90)
credit_train = credit_data[index,]
credit_test = credit_data[-index,]

Some of these predictors are categorical variables and they will enter the gam() model as partially linear terms. We only add flexible s() function to those continuous predictor variables such as LIMIT_BAL, AGE etc. Here we will demonstrate using five continuous variables as smooth terms and three categorical variables SEX, EDUCATION, and MARRIAGE as partially linear terms to save the space of summary output.

## Create a formula for a model with a large number of variables:
gam_formula <- as.formula("default~s(LIMIT_BAL)+s(AGE)+s(PAY_0)+s(BILL_AMT1)+s(PAY_AMT1)+SEX+EDUCATION+MARRIAGE")

credit_gam <- gam(formula = gam_formula, family=binomial, data=credit_train);
summary(credit_gam)

## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## default ~ s(LIMIT_BAL) + s(AGE) + s(PAY_0) + s(BILL_AMT1) + s(PAY_AMT1) + 
##     SEX + EDUCATION + MARRIAGE
## 
## Parametric coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.238725   0.067409 -18.376  < 2e-16 ***
## SEX2        -0.164904   0.053905  -3.059  0.00222 ** 
## EDUCATION2  -0.055720   0.061951  -0.899  0.36843    
## EDUCATION3  -0.203156   0.084394  -2.407  0.01607 *  
## EDUCATION4  -1.424943   0.338854  -4.205 2.61e-05 ***
## MARRIAGE2   -0.138104   0.060675  -2.276  0.02284 *  
## MARRIAGE3    0.002647   0.233595   0.011  0.99096    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df   Chi.sq p-value    
## s(LIMIT_BAL) 5.727  6.621  145.428 < 2e-16 ***
## s(AGE)       1.022  1.044    2.329 0.12954    
## s(PAY_0)     6.848  7.575 1247.069 < 2e-16 ***
## s(BILL_AMT1) 3.706  4.631   48.692 < 2e-16 ***
## s(PAY_AMT1)  4.981  6.140   26.129 0.00024 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.206   Deviance explained = 17.6%
## UBRE = -0.1273  Scale est. = 1         n = 10800

plot(credit_gam, shade=TRUE, seWithMean=TRUE, scale=0, pages = 1)

The function vis.gam() can visualize the nonlinear relationship between two variables and the linear predictor in a 3D space as follows:

# vis.gam(credit_gam)
vis.gam(credit_gam, view=c("LIMIT_BAL","AGE"), theta= 140) # different view

1.2.1 In-sample fit performance

In order to see the in-sample fit performance, you may look into the confusion matrix by using commands as following. We assume the cut-off probability as 1/6.

pcut_gam <- 1/6
prob_gam_in <-predict(credit_gam,credit_train,type="response")
pred_gam_in <- (prob_gam_in>=pcut_gam)*1
table(credit_train$default, pred_gam_in,dnn=c("Observed","Predicted"))

##         Predicted
## Observed    0    1
##        0 5849 2581
##        1  719 1651

The asymmetric cost with 5:1 cost ratio is:

creditcost <- function(r, pi){
  weight1 = 5
  weight0 = 1
  pcut <- weight0/(weight0+weight1)
  c1 = (r==1)&(pi<pcut) #logical vector - true if actual 1 but predict 0
  c0 = (r==0)&(pi>pcut) #logical vector - true if actual 0 but predict 1
  return(mean(weight1*c1+weight0*c0))
}
creditcost(credit_train$default, pred_gam_in)

## [1] 0.5718519

Training model AIC, BIC, and mean residual deviance:

AIC(credit_gam)

## [1] 9425.127

BIC(credit_gam)

## [1] 9638.527

# credit_gam$deviance

ROC Curve:

library(ROCR)
pred <- prediction(predictions = c(prob_gam_in), labels = credit_train$default)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)

#Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))

## [1] 0.7681283

1.2.2 Out-of-sample fit performance

pcut <- 1/6
prob_gam_out <- predict(credit_gam, credit_test,type="response")
pred_gam_out <- (prob_gam_out>=pcut)*1
table(credit_test$default, pred_gam_out,dnn=c("Observed","Predicted"))

##         Predicted
## Observed   0   1
##        0 625 313
##        1  88 174

The asymmetric cost is

creditcost(credit_test$default, pred_gam_out)

## [1] 0.6275

ROC Curve:

pred <- prediction(predictions = c(prob_gam_out), labels = credit_test$default)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)

#Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))

## [1] 0.7306678

go to top

1.3 GAM using the “Motorcycle” dataset

Finally, we can also apply gam() for a univariate smoothing on the motorcycle data.

library(MASS)
data('mcycle')
str(mcycle)
summary(mcycle)

# Rename the variables for ease of usage
Y <- mcycle$accel
X <- mcycle$times

#Scatterplot
plot(Y~X, xlab="time",ylab="Acceleration", main="Scatterplot of Acceleration against Time")

library(mgcv)
s_gam <- gam(Y ~ s(X),data=mcycle)
summary(s_gam)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Y ~ s(X)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -25.546      1.951   -13.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##        edf Ref.df     F p-value    
## s(X) 8.693  8.972 53.52  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.783   Deviance explained = 79.8%
## GCV = 545.78  Scale est. = 506       n = 133

#plot the model
plot(s_gam, residuals = TRUE, pch = 1)

go to top