First, load the R packages to work with in this analysis.

library(tidyverse)
library(magrittr)
library(ggplot2)
library(plyr)
library(gridExtra)
library(ggthemes)
library(caret)
library(MASS)
library(randomForest)
library(party)
library(corrplot)
library(fastDummies)
library(tree)
library(randomForest)
library(gbm)

1 Perform Data Munging Tasks

Routine data cleaning and structuring in preparation for the analysis and model building.

ChurnURL <- 'https://community.watsonanalytics.com/wp-content/uploads/2015/03/WA_Fn-UseC_-Telco-Customer-Churn.csv'
ChurnData <- read.csv(url(ChurnURL), sep = ",", header = TRUE, stringsAsFactors = FALSE)
ChurnData <- as.tibble(ChurnData)
str(ChurnData)
## Classes 'tbl_df', 'tbl' and 'data.frame':    7043 obs. of  21 variables:
##  $ customerID      : chr  "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
##  $ gender          : chr  "Female" "Male" "Male" "Male" ...
##  $ SeniorCitizen   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Partner         : chr  "Yes" "No" "No" "No" ...
##  $ Dependents      : chr  "No" "No" "No" "No" ...
##  $ tenure          : int  1 34 2 45 2 8 22 10 28 62 ...
##  $ PhoneService    : chr  "No" "Yes" "Yes" "No" ...
##  $ MultipleLines   : chr  "No phone service" "No" "No" "No phone service" ...
##  $ InternetService : chr  "DSL" "DSL" "DSL" "DSL" ...
##  $ OnlineSecurity  : chr  "No" "Yes" "Yes" "Yes" ...
##  $ OnlineBackup    : chr  "Yes" "No" "Yes" "No" ...
##  $ DeviceProtection: chr  "No" "Yes" "No" "Yes" ...
##  $ TechSupport     : chr  "No" "No" "No" "Yes" ...
##  $ StreamingTV     : chr  "No" "No" "No" "No" ...
##  $ StreamingMovies : chr  "No" "No" "No" "No" ...
##  $ Contract        : chr  "Month-to-month" "One year" "Month-to-month" "One year" ...
##  $ PaperlessBilling: chr  "Yes" "No" "Yes" "No" ...
##  $ PaymentMethod   : chr  "Electronic check" "Mailed check" "Mailed check" "Bank transfer (automatic)" ...
##  $ MonthlyCharges  : num  29.9 57 53.9 42.3 70.7 ...
##  $ TotalCharges    : num  29.9 1889.5 108.2 1840.8 151.7 ...
##  $ Churn           : chr  "No" "No" "Yes" "No" ...
sapply(ChurnData, function(x) sum(is.na(x)))
##       customerID           gender    SeniorCitizen          Partner 
##                0                0                0                0 
##       Dependents           tenure     PhoneService    MultipleLines 
##                0                0                0                0 
##  InternetService   OnlineSecurity     OnlineBackup DeviceProtection 
##                0                0                0                0 
##      TechSupport      StreamingTV  StreamingMovies         Contract 
##                0                0                0                0 
## PaperlessBilling    PaymentMethod   MonthlyCharges     TotalCharges 
##                0                0                0               11 
##            Churn 
##                0
ChurnData$TotalCharges[is.na(ChurnData$TotalCharges)] <- 0
sum(is.na(ChurnData$TotalCharges))
## [1] 0
ChurnData$gender <- as.factor(mapvalues(ChurnData$gender,
                                      from = c('Female', 'Male'), to = c(-1, 1)))
ChurnData$SeniorCitizen <- as.factor(mapvalues(ChurnData$SeniorCitizen,
                                      from = c('0', '1'), to = c('No', 'Yes')))
ChurnData$MultipleLines <- as.factor(mapvalues(ChurnData$MultipleLines,
                                from = 'No phone service', to = 'No'))
ChurnData$OnlineSecurity <- as.factor(mapvalues(ChurnData$OnlineSecurity,
                                      from = 'No internet service', to = 'No'))
ChurnData$OnlineBackup <- as.factor(mapvalues(ChurnData$OnlineBackup,
                                      from = 'No internet service', to = 'No'))
ChurnData$DeviceProtection <- as.factor(mapvalues(ChurnData$DeviceProtection,
                                      from = 'No internet service', to = 'No'))
ChurnData$TechSupport <- as.factor(mapvalues(ChurnData$TechSupport,
                                      from = 'No internet service', to = 'No'))
ChurnData$StreamingTV <- as.factor(mapvalues(ChurnData$StreamingTV,
                                      from = 'No internet service', to = 'No'))
ChurnData$StreamingMovies <- as.factor(mapvalues(ChurnData$StreamingMovies,
                                      from = 'No internet service', to = 'No'))
ChurnData$Churn <- as.factor(mapvalues(ChurnData$Churn,
                                       from = c('No', 'Yes'), to = c('0', '1')))
min(ChurnData$tenure); max(ChurnData$tenure)
## [1] 0
## [1] 72
ChurnData$tenure_group <- cut(ChurnData$tenure, breaks = c(-1, 6, 12, 18, 24, 30, 
                                                           36, 42, 48, 54, 60, 66, 72),
                              labels = c('0 - 6 Mos', '7 - 12 Mos', '13 - 18 Mos', 
                                         '19 - 24 Mos','25 - 30 Mos', '31 - 36 Mos',
                                         '37 - 42 Mos', '43 - 48 Mos','49 - 54 Mos', 
                                         '55 - 60 Mos','61 - 66 Mos', '67 - 72 Mos'))
ChurnData$customerID <- NULL; ChurnData$tenure <- NULL

Check for any significant correlation levels between features.

numeric <- sapply(ChurnData, is.numeric)
CorrTable <- cor(ChurnData[, numeric])
CorrTable
##                MonthlyCharges TotalCharges
## MonthlyCharges         1.0000       0.6512
## TotalCharges           0.6512       1.0000
ChurnData$MonthlyCharges <- NULL

MonthlyCharges and TotalCharges are highly correlated so we remove one feature – Monthly Charges – before modelling the data.

2 Visualizing Customer Characteristics

Plot features that might help with understanding churn. Note: in the Gender plot, -1 designates Female.

These features represent characteristics of service subscribers.

p1 <- ggplot(ChurnData, aes(x = gender)) + geom_bar(aes( y = 100 * (..count..) /
        sum(..count..)), width = 0.5) + ggtitle('Gender') + xlab('Gender') + 
        ylab('Percentage') + coord_flip() + theme_economist()
p2 <- ggplot(ChurnData, aes(x = SeniorCitizen)) + geom_bar(aes( y = 100 * (..count..) /
        sum(..count..)), width = 0.5) + ggtitle('Senior Citizen') + 
        xlab('Senior Citizen') + ylab('Percentage') + coord_flip() + theme_economist() 
p3 <- ggplot(ChurnData, aes(x = Partner)) + geom_bar(aes( y = 100 * (..count..) /
        sum(..count..)), width = 0.5) + ggtitle('Has a Partner') + 
        xlab('Partner') + ylab('Percentage') + coord_flip() + theme_economist()
p4 <- ggplot(ChurnData, aes(x = Dependents)) + geom_bar(aes( y = 100 * (..count..) /
        sum(..count..)), width = 0.5) + ggtitle('Has Dependents') + 
        xlab('Dependents') + ylab('Percentage') + coord_flip() + theme_economist()
grid.arrange(p1, p2, p3, p4, ncol = 2)

p5 <- ggplot(ChurnData, aes(x = PhoneService)) + geom_bar(aes( y = 100 * (..count..) /
        sum(..count..)), width = 0.5) + ggtitle('Phone Service') + xlab('Phone Service') + 
        ylab('Percentage') + coord_flip() + theme_economist()
p6 <- ggplot(ChurnData, aes(x = MultipleLines)) + geom_bar(aes( y = 100 * (..count..) /
        sum(..count..)), width = 0.5) + ggtitle('Multiple Lines') + 
        xlab('Multiple Lines') + ylab('Percentage') + coord_flip() + theme_economist() 
p7 <- ggplot(ChurnData, aes(x = InternetService)) + geom_bar(aes( y = 100 * (..count..) /
        sum(..count..)), width = 0.5) + ggtitle('Internet Service') + 
        xlab('Internet Service') + ylab('Percentage') + coord_flip() + theme_economist()
p8 <- ggplot(ChurnData, aes(x = OnlineSecurity)) + geom_bar(aes( y = 100 * (..count..) /
        sum(..count..)), width = 0.5) + ggtitle('Online Security') + 
        xlab('Online Security') + ylab('Percentage') + coord_flip() + theme_economist()
grid.arrange(p5, p6, p7, p8, ncol = 2)

p9 <- ggplot(ChurnData, aes(x = OnlineBackup)) + geom_bar(aes( y = 100 * (..count..) /
        sum(..count..)), width = 0.5) + ggtitle('Online Backup') + xlab('Online Backup') + 
        ylab('Percentage') + coord_flip() + theme_economist()
p10 <- ggplot(ChurnData, aes(x = DeviceProtection)) + geom_bar(aes( y = 100 * (..count..) /
        sum(..count..)), width = 0.5) + ggtitle('Device Protection') + 
        xlab('Device Protection') + ylab('Percentage') + coord_flip() + theme_economist() 
p11 <- ggplot(ChurnData, aes(x = TechSupport)) + geom_bar(aes( y = 100 * (..count..) /
        sum(..count..)), width = 0.5) + ggtitle('Tech Support') + 
        xlab('Tech Support') + ylab('Percentage') + coord_flip() + theme_economist()
p12 <- ggplot(ChurnData, aes(x = StreamingTV)) + geom_bar(aes( y = 100 * (..count..) /
        sum(..count..)), width = 0.5) + ggtitle('Streaming TV') + 
        xlab('Streaming TV') + ylab('Percentage') + coord_flip() + theme_economist()
grid.arrange(p9, p10, p11, p12, ncol = 2)

p13 <- ggplot(ChurnData, aes(x = StreamingMovies)) + geom_bar(aes( y = 100 * (..count..) /
        sum(..count..)), width = 0.5) + ggtitle('Streaming Movies') + xlab('StreamingMovies') + 
        ylab('Percentage') + coord_flip() + theme_economist()
p14 <- ggplot(ChurnData, aes(x = Contract)) + geom_bar(aes( y = 100 * (..count..) /
        sum(..count..)), width = 0.5) + ggtitle('Contract Type') + 
        xlab('Contract') + ylab('Percentage') + coord_flip() + theme_economist() 
p15 <- ggplot(ChurnData, aes(x = PaperlessBilling)) + geom_bar(aes( y = 100 * (..count..) /
        sum(..count..)), width = 0.5) + ggtitle('Paperless Billing') + 
        xlab('Paperless Billing') + ylab('Percentage') + coord_flip() + theme_economist()
p16 <- ggplot(ChurnData, aes(x = PaymentMethod)) + geom_bar(aes( y = 100 * (..count..) /
        sum(..count..)), width = 0.5) + ggtitle('Payment') + 
        xlab('Payment Method') + ylab('Percentage') + coord_flip() + theme_economist()
grid.arrange(p13, p14, p15, p16, ncol = 2)

Notice, over half of all customers are on a monthly contract.

The length of time a customer has been with the service may be important to churn. Notice the largest group of customers are the newest, with less than six months service. Half of them are lost by the end of the first year.

p17 <- ggplot(ChurnData, aes(x = tenure_group)) + geom_bar(aes( y = 100 * (..count..) /
        sum(..count..)), width = 0.5) + ggtitle('Tenure Groups') + 
        xlab('Tenure Groups') + ylab('Percentage') + coord_flip() + theme_economist()
p17

Historically, 26% of customers in this data set churn. If we were to guess all customers stay, we would be right 74% of the time. That is, a naive approach of guessing the same result in all instances would have an accuracy rate of 74%. What we want to see in this study is if modelling the problem provides any improvement. In other words, does analysis and modelling add information?

p18 <- ggplot(ChurnData, aes(x = Churn)) + geom_bar(aes( y = 100 * (..count..) /
        sum(..count..)), width = 0.5) + ggtitle('Customer Churn') + 
        xlab('Churn') + ylab('Percentage') + coord_flip() + theme_economist()
p18

3 Final Data Prep for Modelling

More data preparation for modelling. Here we create dummy variables and remove redundant features.

ChurnDataDum <- dummy_cols(ChurnData)
ChurnTenure <- ChurnDataDum[, 19]
Churn <- ChurnDataDum[, 18]
ChurnDataDum <- ChurnDataDum[, -c(2:16, 18:22, 25:26, 28, 30, 34:35, 38:39, 41, 43, 45, 49, 51, 55:57, 69)]

Create training and testing subsets of the full data set. The data are split 70% for training and 30% for testing.

ChurnDataDum <- cbind(Churn, ChurnDataDum)
intrain <- createDataPartition(ChurnDataDum$Churn, p = 0.7, list = FALSE)
set.seed(2017)
training <- ChurnDataDum[intrain, ]
testing <- ChurnDataDum[-intrain, ]
dim(training); dim(testing)
## [1] 4931   33
## [1] 2112   33

4 Logistic Regression Models

Build the first log regression model.

LogModel <- glm(Churn ~. , family = binomial(link = 'logit'), data = training)
LogModel
## 
## Call:  glm(formula = Churn ~ ., family = binomial(link = "logit"), data = training)
## 
## Coefficients:
##                               (Intercept)  
##                                 -4.337425  
##                                   gender1  
##                                  0.005377  
##                              TotalCharges  
##                                  0.000102  
##                         SeniorCitizen_Yes  
##                                  0.394330  
##                               Partner_Yes  
##                                  0.112211  
##                            Dependents_Yes  
##                                 -0.191822  
##                          PhoneService_Yes  
##                                 -0.624438  
##                         MultipleLines_Yes  
##                                  0.308709  
##                       InternetService_DSL  
##                                  0.741352  
##             `InternetService_Fiber optic`  
##                                  1.682586  
##                        OnlineSecurity_Yes  
##                                 -0.453168  
##                          OnlineBackup_Yes  
##                                 -0.322863  
##                      DeviceProtection_Yes  
##                                 -0.011970  
##                           TechSupport_Yes  
##                                 -0.286618  
##                           StreamingTV_Yes  
##                                  0.229010  
##                       StreamingMovies_Yes  
##                                  0.283132  
##                 `Contract_Month-to-month`  
##                                  1.688480  
##                       `Contract_One year`  
##                                  0.992836  
##                      PaperlessBilling_Yes  
##                                  0.371091  
##          `PaymentMethod_Electronic check`  
##                                  0.370579  
##              `PaymentMethod_Mailed check`  
##                                  0.011406  
## `PaymentMethod_Bank transfer (automatic)`  
##                                  0.071872  
##                  `tenure_group_0 - 6 Mos`  
##                                  1.920175  
##                `tenure_group_31 - 36 Mos`  
##                                 -0.020716  
##                `tenure_group_43 - 48 Mos`  
##                                 -0.040279  
##                 `tenure_group_7 - 12 Mos`  
##                                  1.079799  
##                `tenure_group_19 - 24 Mos`  
##                                  0.299911  
##                `tenure_group_25 - 30 Mos`  
##                                  0.253399  
##                `tenure_group_61 - 66 Mos`  
##                                 -0.783714  
##                `tenure_group_13 - 18 Mos`  
##                                  0.707674  
##                `tenure_group_55 - 60 Mos`  
##                                 -0.572715  
##                `tenure_group_49 - 54 Mos`  
##                                 -0.070953  
##                `tenure_group_67 - 72 Mos`  
##                                 -0.595183  
## 
## Degrees of Freedom: 4930 Total (i.e. Null);  4898 Residual
## Null Deviance:       5710 
## Residual Deviance: 4010  AIC: 4070

A summary of the first model indicates factors that are most important to understanding churn. Those below marked with asterisks are statistically significant. But we will not need so many to understand churn as we proceed with the analysis.

summary(LogModel)
## 
## Call:
## glm(formula = Churn ~ ., family = binomial(link = "logit"), data = training)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.200  -0.682  -0.279   0.608   3.290  
## 
## Coefficients:
##                                            Estimate Std. Error z value
## (Intercept)                               -4.34e+00   3.76e-01  -11.54
## gender1                                    5.38e-03   7.86e-02    0.07
## TotalCharges                               1.02e-04   8.46e-05    1.20
## SeniorCitizen_Yes                          3.94e-01   1.01e-01    3.89
## Partner_Yes                                1.12e-01   9.52e-02    1.18
## Dependents_Yes                            -1.92e-01   1.10e-01   -1.75
## PhoneService_Yes                          -6.24e-01   1.63e-01   -3.82
## MultipleLines_Yes                          3.09e-01   9.80e-02    3.15
## InternetService_DSL                        7.41e-01   1.66e-01    4.45
## `InternetService_Fiber optic`              1.68e+00   1.72e-01    9.75
## OnlineSecurity_Yes                        -4.53e-01   1.04e-01   -4.37
## OnlineBackup_Yes                          -3.23e-01   9.47e-02   -3.41
## DeviceProtection_Yes                      -1.20e-02   9.71e-02   -0.12
## TechSupport_Yes                           -2.87e-01   1.05e-01   -2.74
## StreamingTV_Yes                            2.29e-01   1.01e-01    2.27
## StreamingMovies_Yes                        2.83e-01   1.00e-01    2.82
## `Contract_Month-to-month`                  1.69e+00   2.26e-01    7.47
## `Contract_One year`                        9.93e-01   2.21e-01    4.48
## PaperlessBilling_Yes                       3.71e-01   9.03e-02    4.11
## `PaymentMethod_Electronic check`           3.71e-01   1.17e-01    3.18
## `PaymentMethod_Mailed check`               1.14e-02   1.41e-01    0.08
## `PaymentMethod_Bank transfer (automatic)`  7.19e-02   1.37e-01    0.52
## `tenure_group_0 - 6 Mos`                   1.92e+00   2.94e-01    6.53
## `tenure_group_31 - 36 Mos`                -2.07e-02   2.48e-01   -0.08
## `tenure_group_43 - 48 Mos`                -4.03e-02   2.61e-01   -0.15
## `tenure_group_7 - 12 Mos`                  1.08e+00   2.87e-01    3.76
## `tenure_group_19 - 24 Mos`                 3.00e-01   2.65e-01    1.13
## `tenure_group_25 - 30 Mos`                 2.53e-01   2.54e-01    1.00
## `tenure_group_61 - 66 Mos`                -7.84e-01   3.48e-01   -2.25
## `tenure_group_13 - 18 Mos`                 7.08e-01   2.75e-01    2.57
## `tenure_group_55 - 60 Mos`                -5.73e-01   3.01e-01   -1.90
## `tenure_group_49 - 54 Mos`                -7.10e-02   2.66e-01   -0.27
## `tenure_group_67 - 72 Mos`                -5.95e-01   3.74e-01   -1.59
##                                           Pr(>|z|)    
## (Intercept)                                < 2e-16 ***
## gender1                                    0.94545    
## TotalCharges                               0.22987    
## SeniorCitizen_Yes                          0.00010 ***
## Partner_Yes                                0.23842    
## Dependents_Yes                             0.08000 .  
## PhoneService_Yes                           0.00013 ***
## MultipleLines_Yes                          0.00162 ** 
## InternetService_DSL                        8.5e-06 ***
## `InternetService_Fiber optic`              < 2e-16 ***
## OnlineSecurity_Yes                         1.2e-05 ***
## OnlineBackup_Yes                           0.00066 ***
## DeviceProtection_Yes                       0.90190    
## TechSupport_Yes                            0.00618 ** 
## StreamingTV_Yes                            0.02320 *  
## StreamingMovies_Yes                        0.00479 ** 
## `Contract_Month-to-month`                  8.3e-14 ***
## `Contract_One year`                        7.3e-06 ***
## PaperlessBilling_Yes                       4.0e-05 ***
## `PaymentMethod_Electronic check`           0.00148 ** 
## `PaymentMethod_Mailed check`               0.93535    
## `PaymentMethod_Bank transfer (automatic)`  0.60077    
## `tenure_group_0 - 6 Mos`                   6.4e-11 ***
## `tenure_group_31 - 36 Mos`                 0.93340    
## `tenure_group_43 - 48 Mos`                 0.87750    
## `tenure_group_7 - 12 Mos`                  0.00017 ***
## `tenure_group_19 - 24 Mos`                 0.25759    
## `tenure_group_25 - 30 Mos`                 0.31792    
## `tenure_group_61 - 66 Mos`                 0.02441 *  
## `tenure_group_13 - 18 Mos`                 0.01014 *  
## `tenure_group_55 - 60 Mos`                 0.05696 .  
## `tenure_group_49 - 54 Mos`                 0.78999    
## `tenure_group_67 - 72 Mos`                 0.11150    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5707.1  on 4930  degrees of freedom
## Residual deviance: 4005.5  on 4898  degrees of freedom
## AIC: 4072
## 
## Number of Fisher Scoring iterations: 6

The analysis of variance (ANOVA) gives similar results regarding the importance of specific factors.

anova(LogModel, test = 'Chisq')
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Churn
## 
## Terms added sequentially (first to last)
## 
## 
##                                           Df Deviance Resid. Df Resid. Dev
## NULL                                                       4930       5707
## gender                                     1        0      4929       5707
## TotalCharges                               1      188      4928       5519
## SeniorCitizen_Yes                          1      199      4927       5320
## Partner_Yes                                1       31      4926       5288
## Dependents_Yes                             1       59      4925       5229
## PhoneService_Yes                           1        6      4924       5223
## MultipleLines_Yes                          1       74      4923       5149
## InternetService_DSL                        1       64      4922       5085
## `InternetService_Fiber optic`              1      668      4921       4417
## OnlineSecurity_Yes                         1       48      4920       4370
## OnlineBackup_Yes                           1        9      4919       4360
## DeviceProtection_Yes                       1        0      4918       4360
## TechSupport_Yes                            1       18      4917       4343
## StreamingTV_Yes                            1       23      4916       4319
## StreamingMovies_Yes                        1       16      4915       4303
## `Contract_Month-to-month`                  1      102      4914       4201
## `Contract_One year`                        1       23      4913       4179
## PaperlessBilling_Yes                       1       15      4912       4164
## `PaymentMethod_Electronic check`           1       19      4911       4145
## `PaymentMethod_Mailed check`               1        2      4910       4143
## `PaymentMethod_Bank transfer (automatic)`  1        0      4909       4142
## `tenure_group_0 - 6 Mos`                   1      106      4908       4036
## `tenure_group_31 - 36 Mos`                 1        3      4907       4033
## `tenure_group_43 - 48 Mos`                 1        0      4906       4033
## `tenure_group_7 - 12 Mos`                  1       13      4905       4020
## `tenure_group_19 - 24 Mos`                 1        0      4904       4020
## `tenure_group_25 - 30 Mos`                 1        0      4903       4020
## `tenure_group_61 - 66 Mos`                 1        2      4902       4018
## `tenure_group_13 - 18 Mos`                 1        7      4901       4011
## `tenure_group_55 - 60 Mos`                 1        2      4900       4009
## `tenure_group_49 - 54 Mos`                 1        1      4899       4008
## `tenure_group_67 - 72 Mos`                 1        3      4898       4006
##                                           Pr(>Chi)    
## NULL                                                  
## gender                                     0.67211    
## TotalCharges                               < 2e-16 ***
## SeniorCitizen_Yes                          < 2e-16 ***
## Partner_Yes                                2.0e-08 ***
## Dependents_Yes                             1.8e-14 ***
## PhoneService_Yes                           0.01431 *  
## MultipleLines_Yes                          < 2e-16 ***
## InternetService_DSL                        1.2e-15 ***
## `InternetService_Fiber optic`              < 2e-16 ***
## OnlineSecurity_Yes                         4.4e-12 ***
## OnlineBackup_Yes                           0.00242 ** 
## DeviceProtection_Yes                       0.72241    
## TechSupport_Yes                            2.7e-05 ***
## StreamingTV_Yes                            1.3e-06 ***
## StreamingMovies_Yes                        5.9e-05 ***
## `Contract_Month-to-month`                  < 2e-16 ***
## `Contract_One year`                        2.0e-06 ***
## PaperlessBilling_Yes                       0.00012 ***
## `PaymentMethod_Electronic check`           1.3e-05 ***
## `PaymentMethod_Mailed check`               0.13387    
## `PaymentMethod_Bank transfer (automatic)`  0.60636    
## `tenure_group_0 - 6 Mos`                   < 2e-16 ***
## `tenure_group_31 - 36 Mos`                 0.08333 .  
## `tenure_group_43 - 48 Mos`                 0.92327    
## `tenure_group_7 - 12 Mos`                  0.00037 ***
## `tenure_group_19 - 24 Mos`                 0.52028    
## `tenure_group_25 - 30 Mos`                 0.70368    
## `tenure_group_61 - 66 Mos`                 0.14457    
## `tenure_group_13 - 18 Mos`                 0.00975 ** 
## `tenure_group_55 - 60 Mos`                 0.13323    
## `tenure_group_49 - 54 Mos`                 0.41911    
## `tenure_group_67 - 72 Mos`                 0.10720    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Removing insignificant features from the first table yields a simpler model, so we will build a second log regression analysis. In this model, we have incorporated results from the summary() and anova() commands following to streamline the analysis.

LogModel2 <- glm(Churn ~ TotalCharges + MultipleLines_Yes + InternetService_DSL + 
                         `InternetService_Fiber optic` + OnlineSecurity_Yes + 
                         TechSupport_Yes + StreamingTV_Yes + StreamingMovies_Yes + 
                         `Contract_Month-to-month` + `Contract_One year` + PaperlessBilling_Yes +
                         `PaymentMethod_Electronic check` + `tenure_group_0 - 6 Mos`,
                         family = binomial(link = 'logit'), 
                         data = training)
LogModel2
## 
## Call:  glm(formula = Churn ~ TotalCharges + MultipleLines_Yes + InternetService_DSL + 
##     `InternetService_Fiber optic` + OnlineSecurity_Yes + TechSupport_Yes + 
##     StreamingTV_Yes + StreamingMovies_Yes + `Contract_Month-to-month` + 
##     `Contract_One year` + PaperlessBilling_Yes + `PaymentMethod_Electronic check` + 
##     `tenure_group_0 - 6 Mos`, family = binomial(link = "logit"), 
##     data = training)
## 
## Coefficients:
##                      (Intercept)                      TotalCharges  
##                        -4.262754                         -0.000187  
##                MultipleLines_Yes               InternetService_DSL  
##                         0.289061                          0.931977  
##    `InternetService_Fiber optic`                OnlineSecurity_Yes  
##                         1.914219                         -0.444324  
##                  TechSupport_Yes                   StreamingTV_Yes  
##                        -0.286239                          0.295537  
##              StreamingMovies_Yes         `Contract_Month-to-month`  
##                         0.394641                          1.779448  
##              `Contract_One year`              PaperlessBilling_Yes  
##                         0.991664                          0.385197  
## `PaymentMethod_Electronic check`          `tenure_group_0 - 6 Mos`  
##                         0.362685                          1.050659  
## 
## Degrees of Freedom: 4930 Total (i.e. Null);  4917 Residual
## Null Deviance:       5710 
## Residual Deviance: 4080  AIC: 4110

Again, the summary confirms what we saw earlier regarding the most significant features.

summary(LogModel2)
## 
## Call:
## glm(formula = Churn ~ TotalCharges + MultipleLines_Yes + InternetService_DSL + 
##     `InternetService_Fiber optic` + OnlineSecurity_Yes + TechSupport_Yes + 
##     StreamingTV_Yes + StreamingMovies_Yes + `Contract_Month-to-month` + 
##     `Contract_One year` + PaperlessBilling_Yes + `PaymentMethod_Electronic check` + 
##     `tenure_group_0 - 6 Mos`, family = binomial(link = "logit"), 
##     data = training)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.124  -0.660  -0.288   0.621   2.993  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      -4.26e+00   2.30e-01  -18.52  < 2e-16 ***
## TotalCharges                     -1.87e-04   3.15e-05   -5.92  3.1e-09 ***
## MultipleLines_Yes                 2.89e-01   9.33e-02    3.10   0.0019 ** 
## InternetService_DSL               9.32e-01   1.53e-01    6.08  1.2e-09 ***
## `InternetService_Fiber optic`     1.91e+00   1.60e-01   11.95  < 2e-16 ***
## OnlineSecurity_Yes               -4.44e-01   1.02e-01   -4.36  1.3e-05 ***
## TechSupport_Yes                  -2.86e-01   1.02e-01   -2.81   0.0050 ** 
## StreamingTV_Yes                   2.96e-01   9.71e-02    3.04   0.0023 ** 
## StreamingMovies_Yes               3.95e-01   9.64e-02    4.10  4.2e-05 ***
## `Contract_Month-to-month`         1.78e+00   2.04e-01    8.71  < 2e-16 ***
## `Contract_One year`               9.92e-01   2.06e-01    4.82  1.4e-06 ***
## PaperlessBilling_Yes              3.85e-01   8.92e-02    4.32  1.6e-05 ***
## `PaymentMethod_Electronic check`  3.63e-01   8.30e-02    4.37  1.2e-05 ***
## `tenure_group_0 - 6 Mos`          1.05e+00   1.05e-01   10.01  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5707.1  on 4930  degrees of freedom
## Residual deviance: 4077.4  on 4917  degrees of freedom
## AIC: 4105
## 
## Number of Fisher Scoring iterations: 6

The ANOVA of the second log regression model is similar to the first.

anova(LogModel2, test = 'Chisq')
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Churn
## 
## Terms added sequentially (first to last)
## 
## 
##                                  Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL                                              4930       5707         
## TotalCharges                      1      188      4929       5519  < 2e-16
## MultipleLines_Yes                 1      114      4928       5405  < 2e-16
## InternetService_DSL               1       62      4927       5344  4.0e-15
## `InternetService_Fiber optic`     1      861      4926       4482  < 2e-16
## OnlineSecurity_Yes                1       57      4925       4425  4.4e-14
## TechSupport_Yes                   1       24      4924       4401  8.3e-07
## StreamingTV_Yes                   1       24      4923       4376  7.7e-07
## StreamingMovies_Yes               1       19      4922       4358  1.4e-05
## `Contract_Month-to-month`         1      114      4921       4243  < 2e-16
## `Contract_One year`               1       23      4920       4220  1.3e-06
## PaperlessBilling_Yes              1       17      4919       4203  3.3e-05
## `PaymentMethod_Electronic check`  1       22      4918       4181  2.7e-06
## `tenure_group_0 - 6 Mos`          1      103      4917       4077  < 2e-16
##                                     
## NULL                                
## TotalCharges                     ***
## MultipleLines_Yes                ***
## InternetService_DSL              ***
## `InternetService_Fiber optic`    ***
## OnlineSecurity_Yes               ***
## TechSupport_Yes                  ***
## StreamingTV_Yes                  ***
## StreamingMovies_Yes              ***
## `Contract_Month-to-month`        ***
## `Contract_One year`              ***
## PaperlessBilling_Yes             ***
## `PaymentMethod_Electronic check` ***
## `tenure_group_0 - 6 Mos`         ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We can now develop predictions about whether a customer is likely to churn.

glm.fit <- predict.glm(LogModel, newdata = testing, type = 'response')
glm.fit2 <- predict.glm(LogModel2, newdata = testing, type = 'response')
glm.fit <- round(glm.fit, 0)
glm.fit <- as.character(glm.fit)
glm.fit2 <- round(glm.fit2, 0)
glm.fit2 <- as.character(glm.fit2)

An accuracy test for the first log regression model shows it is 82% accurate. This is a 10% improvement over guessing.

testing$Churn <- as.character(testing$Churn)
misClassErr <- mean(glm.fit != testing$Churn)
misClassErr2 <- mean(glm.fit2 != testing$Churn)
print('Accuracy Rate LogMod'); 1 - misClassErr
## [1] "Accuracy Rate LogMod"
## [1] 0.7973

We get the same results for the accuracy of the second model.

print('Accuracy Rate LogMod2'); 1 - misClassErr2
## [1] "Accuracy Rate LogMod2"
## [1] 0.7992

As can be seen by the confusion tables that follow, the models appear to be better at predicting which customers are least likely to churn than they are at identifying customers who are most likely to churn. False positives make up 30% of positive predictions, whereas false negatives make up 15% of negative predictions.

print("Confusion Matrix for Log Regression Model 1"); table(testing$Churn, glm.fit > 0.5)
## [1] "Confusion Matrix for Log Regression Model 1"
##    
##     FALSE TRUE
##   0  1389  163
##   1   265  295
print("Confusion Matrix for Log Regression Model 2"); table(testing$Churn, glm.fit2 > 0.5)
## [1] "Confusion Matrix for Log Regression Model 2"
##    
##     FALSE TRUE
##   0  1401  151
##   1   273  287

5 Building A Decision Tree Analysis

Whether a customer churns or not is a binary decision – “Yes” or “No”. Decision trees are made for this kind of problem. In this first pass, just four factors are considered significant for predictive purposes. They are identified in the output that follows. We see the error rate is roughly 21%, on par with the error rates from the log regression models.

NoChurn <- ifelse(ChurnDataDum$Churn == 0, 'No', 'Yes')
ChurnDataDum <- data.frame(NoChurn, ChurnDataDum)
DT.Churn <- tree(NoChurn ~. -Churn, data = ChurnDataDum)
summary(DT.Churn)
## 
## Classification tree:
## tree(formula = NoChurn ~ . - Churn, data = ChurnDataDum)
## Variables actually used in tree construction:
## [1] "Contract_Month.to.month"     "InternetService_Fiber.optic"
## [3] "tenure_group_0...6.Mos"      "TotalCharges"               
## Number of terminal nodes:  6 
## Residual mean deviance:  0.881 = 6200 / 7040 
## Misclassification error rate: 0.209 = 1470 / 7043

A visual of the tree contributes to understanding the importance of the four features and how they rank with each other. We can see this tree consists of six terminal nodes, only one of which results in the customer’s decision to leave service. In this analysis, a customer on a month-to-month contract, with a fiber optic connection, who has paid less than $1556 in total charges, is a likely candidate to churn.

plot(DT.Churn); text(DT.Churn)

Using the decision tree model just developed, we can generate specific predictions. Again, decision tree accuracy is about 78%, a 6% improvement over guessing.

set.seed(805)
ChurnDataDum <- ChurnDataDum[-7043, ]
DT.train <- sample(1:nrow(ChurnDataDum), 3521)
DT.test <- ChurnDataDum[-DT.train, ]
DT.Churn2 <- tree(NoChurn ~ . -Churn, data = ChurnDataDum, subset = DT.train)
DT.pred <- predict(DT.Churn2, DT.test, type = 'class')
confusionMatrix(DT.pred, DT.test$NoChurn)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  2411  627
##        Yes  152  331
##                                         
##                Accuracy : 0.779         
##                  95% CI : (0.765, 0.792)
##     No Information Rate : 0.728         
##     P-Value [Acc > NIR] : 2.55e-12      
##                                         
##                   Kappa : 0.339         
##  Mcnemar's Test P-Value : < 2e-16       
##                                         
##             Sensitivity : 0.941         
##             Specificity : 0.346         
##          Pos Pred Value : 0.794         
##          Neg Pred Value : 0.685         
##              Prevalence : 0.728         
##          Detection Rate : 0.685         
##    Detection Prevalence : 0.863         
##       Balanced Accuracy : 0.643         
##                                         
##        'Positive' Class : No            
## 

It is possible to further simplify the decision tree model by pruning back branches that don’t contribute to model accuracy.

set.seed(805) 
cv.Churn <- cv.tree(DT.Churn, FUN = prune.misclass)
names(cv.Churn)
## [1] "size"   "dev"    "k"      "method"
cv.Churn
## $size
## [1] 6 4 1
## 
## $dev
## [1] 1501 1501 1869
## 
## $k
## [1] -Inf    0  133
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"

It appears a tree with four terminal nodes is just as accurate as a tree with six nodes. Again, only one node results in a customer churning, which means 75% of customers (3 of 4) do not churn in the model. That is on par with guessing.

prune.Churn <- prune.misclass(DT.Churn, best = 4)
plot(prune.Churn); text(prune.Churn)

The four-node tree is just as accurate as the six-node tree, with an accuracy rate of 78%.

prune.pred <- predict(prune.Churn, DT.test, type = 'class')
confusionMatrix(prune.pred, DT.test$NoChurn)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  2387  594
##        Yes  176  364
##                                         
##                Accuracy : 0.781         
##                  95% CI : (0.767, 0.795)
##     No Information Rate : 0.728         
##     P-Value [Acc > NIR] : 1.94e-13      
##                                         
##                   Kappa : 0.361         
##  Mcnemar's Test P-Value : < 2e-16       
##                                         
##             Sensitivity : 0.931         
##             Specificity : 0.380         
##          Pos Pred Value : 0.801         
##          Neg Pred Value : 0.674         
##              Prevalence : 0.728         
##          Detection Rate : 0.678         
##    Detection Prevalence : 0.847         
##       Balanced Accuracy : 0.656         
##                                         
##        'Positive' Class : No            
## 

6 Building a Random Forest Solution

An ensemble method, such as random forest, builds out a large number of decision tree models and averages their results. At 79%, the accuracy rate of this approach is similar to the log regression and decision tree algorithms. This algorithm offers a 7% improvement over guessing. The random forest is a slight improvement over the decision tree, in this analysis.

set.seed(805)
rf.Churn <- randomForest(NoChurn ~ . -Churn, data = ChurnDataDum, subset = DT.train,
                          mtry = 6, importance = TRUE)
pred.rf <- predict(rf.Churn, newdata = DT.test)
confusionMatrix(pred.rf, DT.test$NoChurn)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  2357  543
##        Yes  206  415
##                                         
##                Accuracy : 0.787         
##                  95% CI : (0.773, 0.801)
##     No Information Rate : 0.728         
##     P-Value [Acc > NIR] : 2.83e-16      
##                                         
##                   Kappa : 0.396         
##  Mcnemar's Test P-Value : < 2e-16       
##                                         
##             Sensitivity : 0.920         
##             Specificity : 0.433         
##          Pos Pred Value : 0.813         
##          Neg Pred Value : 0.668         
##              Prevalence : 0.728         
##          Detection Rate : 0.669         
##    Detection Prevalence : 0.824         
##       Balanced Accuracy : 0.676         
##                                         
##        'Positive' Class : No            
## 

Visualizing these results is helpful. Again, we see the models are better at identifying a customer unlikely to leave better than they are at identifying a customer who is likely to leave.

plot(pred.rf, DT.test$NoChurn, main = 'Random Forest Results',
     xlab = 'Predicted Value', ylab = 'Actual Value')

Finally, we can see which features contribute most to a customer’s decision to churn. In tabular form, they are unranked.

importance(rf.Churn)
##                                               No     Yes
## gender                                   -1.4240 -0.6984
## TotalCharges                             22.3778 21.2693
## SeniorCitizen_Yes                         9.7989 -3.2163
## Partner_Yes                               4.3332  1.8516
## Dependents_Yes                            1.2990 10.2147
## PhoneService_Yes                         -8.9566 19.9595
## MultipleLines_Yes                         3.4685  7.9834
## InternetService_DSL                     -10.0015 20.9954
## InternetService_Fiber.optic              15.6548 47.8150
## OnlineSecurity_Yes                        6.0971 26.8004
## OnlineBackup_Yes                         10.9881 -1.6273
## DeviceProtection_Yes                     16.7526 -8.9533
## TechSupport_Yes                          11.5305 14.0583
## StreamingTV_Yes                          10.5591  5.5190
## StreamingMovies_Yes                       9.2498  2.4459
## Contract_Month.to.month                   0.4334 31.5580
## Contract_One.year                       -15.7257 14.3068
## PaperlessBilling_Yes                     -0.7857 22.1208
## PaymentMethod_Electronic.check            2.2306 21.3031
## PaymentMethod_Mailed.check                2.8051 -2.4988
## PaymentMethod_Bank.transfer..automatic.  -1.2166  1.3519
## tenure_group_0...6.Mos                   15.2599 16.6315
## tenure_group_31...36.Mos                 -1.6638  0.6426
## tenure_group_43...48.Mos                 -2.8084  2.4723
## tenure_group_7...12.Mos                   3.7959 -1.1609
## tenure_group_19...24.Mos                 -0.2397  4.0860
## tenure_group_25...30.Mos                  0.3202  5.4111
## tenure_group_61...66.Mos                  5.5270  7.2554
## tenure_group_13...18.Mos                 -4.2426  6.9804
## tenure_group_55...60.Mos                  1.4893  9.1902
## tenure_group_49...54.Mos                 -6.9619  6.3927
## tenure_group_67...72.Mos                  2.3431  9.8952
##                                         MeanDecreaseAccuracy
## gender                                               -1.5700
## TotalCharges                                         35.9973
## SeniorCitizen_Yes                                     6.2353
## Partner_Yes                                           5.1635
## Dependents_Yes                                        8.6654
## PhoneService_Yes                                      4.8953
## MultipleLines_Yes                                     8.5141
## InternetService_DSL                                  10.8236
## InternetService_Fiber.optic                          44.8226
## OnlineSecurity_Yes                                   24.3397
## OnlineBackup_Yes                                      9.3206
## DeviceProtection_Yes                                 11.4520
## TechSupport_Yes                                      17.4874
## StreamingTV_Yes                                      13.5861
## StreamingMovies_Yes                                  10.3679
## Contract_Month.to.month                              35.6396
## Contract_One.year                                    -1.5028
## PaperlessBilling_Yes                                 14.7033
## PaymentMethod_Electronic.check                       17.5580
## PaymentMethod_Mailed.check                            1.2028
## PaymentMethod_Bank.transfer..automatic.              -0.3221
## tenure_group_0...6.Mos                               26.8217
## tenure_group_31...36.Mos                             -1.1185
## tenure_group_43...48.Mos                             -0.7022
## tenure_group_7...12.Mos                               2.8364
## tenure_group_19...24.Mos                              2.7050
## tenure_group_25...30.Mos                              3.6035
## tenure_group_61...66.Mos                              9.8302
## tenure_group_13...18.Mos                              1.6585
## tenure_group_55...60.Mos                              6.9838
## tenure_group_49...54.Mos                             -2.6947
## tenure_group_67...72.Mos                             10.5005
##                                         MeanDecreaseGini
## gender                                            30.394
## TotalCharges                                     227.613
## SeniorCitizen_Yes                                 29.927
## Partner_Yes                                       29.668
## Dependents_Yes                                    27.690
## PhoneService_Yes                                  15.344
## MultipleLines_Yes                                 30.067
## InternetService_DSL                               19.731
## InternetService_Fiber.optic                       80.697
## OnlineSecurity_Yes                                35.915
## OnlineBackup_Yes                                  30.307
## DeviceProtection_Yes                              29.019
## TechSupport_Yes                                   32.638
## StreamingTV_Yes                                   32.089
## StreamingMovies_Yes                               30.977
## Contract_Month.to.month                          106.655
## Contract_One.year                                 14.459
## PaperlessBilling_Yes                              37.695
## PaymentMethod_Electronic.check                    57.663
## PaymentMethod_Mailed.check                        16.087
## PaymentMethod_Bank.transfer..automatic.           17.029
## tenure_group_0...6.Mos                            53.066
## tenure_group_31...36.Mos                          11.315
## tenure_group_43...48.Mos                          10.202
## tenure_group_7...12.Mos                           12.130
## tenure_group_19...24.Mos                          12.882
## tenure_group_25...30.Mos                          10.041
## tenure_group_61...66.Mos                           7.765
## tenure_group_13...18.Mos                          13.310
## tenure_group_55...60.Mos                          10.231
## tenure_group_49...54.Mos                           9.982
## tenure_group_67...72.Mos                          11.519

When we rank them in graphic form, we find the top four features in the left hand chart (below) match those from the IBM analysis.

varImpPlot(rf.Churn, n.var = 10)