First, load the R packages to work with in this analysis.
library(tidyverse)
library(magrittr)
library(ggplot2)
library(plyr)
library(gridExtra)
library(ggthemes)
library(caret)
library(MASS)
library(randomForest)
library(party)
library(corrplot)
library(fastDummies)
library(tree)
library(randomForest)
library(gbm)
Routine data cleaning and structuring in preparation for the analysis and model building.
ChurnURL <- 'https://community.watsonanalytics.com/wp-content/uploads/2015/03/WA_Fn-UseC_-Telco-Customer-Churn.csv'
ChurnData <- read.csv(url(ChurnURL), sep = ",", header = TRUE, stringsAsFactors = FALSE)
ChurnData <- as.tibble(ChurnData)
str(ChurnData)
## Classes 'tbl_df', 'tbl' and 'data.frame': 7043 obs. of 21 variables:
## $ customerID : chr "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
## $ gender : chr "Female" "Male" "Male" "Male" ...
## $ SeniorCitizen : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Partner : chr "Yes" "No" "No" "No" ...
## $ Dependents : chr "No" "No" "No" "No" ...
## $ tenure : int 1 34 2 45 2 8 22 10 28 62 ...
## $ PhoneService : chr "No" "Yes" "Yes" "No" ...
## $ MultipleLines : chr "No phone service" "No" "No" "No phone service" ...
## $ InternetService : chr "DSL" "DSL" "DSL" "DSL" ...
## $ OnlineSecurity : chr "No" "Yes" "Yes" "Yes" ...
## $ OnlineBackup : chr "Yes" "No" "Yes" "No" ...
## $ DeviceProtection: chr "No" "Yes" "No" "Yes" ...
## $ TechSupport : chr "No" "No" "No" "Yes" ...
## $ StreamingTV : chr "No" "No" "No" "No" ...
## $ StreamingMovies : chr "No" "No" "No" "No" ...
## $ Contract : chr "Month-to-month" "One year" "Month-to-month" "One year" ...
## $ PaperlessBilling: chr "Yes" "No" "Yes" "No" ...
## $ PaymentMethod : chr "Electronic check" "Mailed check" "Mailed check" "Bank transfer (automatic)" ...
## $ MonthlyCharges : num 29.9 57 53.9 42.3 70.7 ...
## $ TotalCharges : num 29.9 1889.5 108.2 1840.8 151.7 ...
## $ Churn : chr "No" "No" "Yes" "No" ...
sapply(ChurnData, function(x) sum(is.na(x)))
## customerID gender SeniorCitizen Partner
## 0 0 0 0
## Dependents tenure PhoneService MultipleLines
## 0 0 0 0
## InternetService OnlineSecurity OnlineBackup DeviceProtection
## 0 0 0 0
## TechSupport StreamingTV StreamingMovies Contract
## 0 0 0 0
## PaperlessBilling PaymentMethod MonthlyCharges TotalCharges
## 0 0 0 11
## Churn
## 0
ChurnData$TotalCharges[is.na(ChurnData$TotalCharges)] <- 0
sum(is.na(ChurnData$TotalCharges))
## [1] 0
ChurnData$gender <- as.factor(mapvalues(ChurnData$gender,
from = c('Female', 'Male'), to = c(-1, 1)))
ChurnData$SeniorCitizen <- as.factor(mapvalues(ChurnData$SeniorCitizen,
from = c('0', '1'), to = c('No', 'Yes')))
ChurnData$MultipleLines <- as.factor(mapvalues(ChurnData$MultipleLines,
from = 'No phone service', to = 'No'))
ChurnData$OnlineSecurity <- as.factor(mapvalues(ChurnData$OnlineSecurity,
from = 'No internet service', to = 'No'))
ChurnData$OnlineBackup <- as.factor(mapvalues(ChurnData$OnlineBackup,
from = 'No internet service', to = 'No'))
ChurnData$DeviceProtection <- as.factor(mapvalues(ChurnData$DeviceProtection,
from = 'No internet service', to = 'No'))
ChurnData$TechSupport <- as.factor(mapvalues(ChurnData$TechSupport,
from = 'No internet service', to = 'No'))
ChurnData$StreamingTV <- as.factor(mapvalues(ChurnData$StreamingTV,
from = 'No internet service', to = 'No'))
ChurnData$StreamingMovies <- as.factor(mapvalues(ChurnData$StreamingMovies,
from = 'No internet service', to = 'No'))
ChurnData$Churn <- as.factor(mapvalues(ChurnData$Churn,
from = c('No', 'Yes'), to = c('0', '1')))
min(ChurnData$tenure); max(ChurnData$tenure)
## [1] 0
## [1] 72
ChurnData$tenure_group <- cut(ChurnData$tenure, breaks = c(-1, 6, 12, 18, 24, 30,
36, 42, 48, 54, 60, 66, 72),
labels = c('0 - 6 Mos', '7 - 12 Mos', '13 - 18 Mos',
'19 - 24 Mos','25 - 30 Mos', '31 - 36 Mos',
'37 - 42 Mos', '43 - 48 Mos','49 - 54 Mos',
'55 - 60 Mos','61 - 66 Mos', '67 - 72 Mos'))
ChurnData$customerID <- NULL; ChurnData$tenure <- NULL
Check for any significant correlation levels between features.
numeric <- sapply(ChurnData, is.numeric)
CorrTable <- cor(ChurnData[, numeric])
CorrTable
## MonthlyCharges TotalCharges
## MonthlyCharges 1.0000 0.6512
## TotalCharges 0.6512 1.0000
ChurnData$MonthlyCharges <- NULL
MonthlyCharges and TotalCharges are highly correlated so we remove one feature – Monthly Charges – before modelling the data.
Plot features that might help with understanding churn. Note: in the Gender plot, -1 designates Female.
These features represent characteristics of service subscribers.
p1 <- ggplot(ChurnData, aes(x = gender)) + geom_bar(aes( y = 100 * (..count..) /
sum(..count..)), width = 0.5) + ggtitle('Gender') + xlab('Gender') +
ylab('Percentage') + coord_flip() + theme_economist()
p2 <- ggplot(ChurnData, aes(x = SeniorCitizen)) + geom_bar(aes( y = 100 * (..count..) /
sum(..count..)), width = 0.5) + ggtitle('Senior Citizen') +
xlab('Senior Citizen') + ylab('Percentage') + coord_flip() + theme_economist()
p3 <- ggplot(ChurnData, aes(x = Partner)) + geom_bar(aes( y = 100 * (..count..) /
sum(..count..)), width = 0.5) + ggtitle('Has a Partner') +
xlab('Partner') + ylab('Percentage') + coord_flip() + theme_economist()
p4 <- ggplot(ChurnData, aes(x = Dependents)) + geom_bar(aes( y = 100 * (..count..) /
sum(..count..)), width = 0.5) + ggtitle('Has Dependents') +
xlab('Dependents') + ylab('Percentage') + coord_flip() + theme_economist()
grid.arrange(p1, p2, p3, p4, ncol = 2)
p5 <- ggplot(ChurnData, aes(x = PhoneService)) + geom_bar(aes( y = 100 * (..count..) /
sum(..count..)), width = 0.5) + ggtitle('Phone Service') + xlab('Phone Service') +
ylab('Percentage') + coord_flip() + theme_economist()
p6 <- ggplot(ChurnData, aes(x = MultipleLines)) + geom_bar(aes( y = 100 * (..count..) /
sum(..count..)), width = 0.5) + ggtitle('Multiple Lines') +
xlab('Multiple Lines') + ylab('Percentage') + coord_flip() + theme_economist()
p7 <- ggplot(ChurnData, aes(x = InternetService)) + geom_bar(aes( y = 100 * (..count..) /
sum(..count..)), width = 0.5) + ggtitle('Internet Service') +
xlab('Internet Service') + ylab('Percentage') + coord_flip() + theme_economist()
p8 <- ggplot(ChurnData, aes(x = OnlineSecurity)) + geom_bar(aes( y = 100 * (..count..) /
sum(..count..)), width = 0.5) + ggtitle('Online Security') +
xlab('Online Security') + ylab('Percentage') + coord_flip() + theme_economist()
grid.arrange(p5, p6, p7, p8, ncol = 2)
p9 <- ggplot(ChurnData, aes(x = OnlineBackup)) + geom_bar(aes( y = 100 * (..count..) /
sum(..count..)), width = 0.5) + ggtitle('Online Backup') + xlab('Online Backup') +
ylab('Percentage') + coord_flip() + theme_economist()
p10 <- ggplot(ChurnData, aes(x = DeviceProtection)) + geom_bar(aes( y = 100 * (..count..) /
sum(..count..)), width = 0.5) + ggtitle('Device Protection') +
xlab('Device Protection') + ylab('Percentage') + coord_flip() + theme_economist()
p11 <- ggplot(ChurnData, aes(x = TechSupport)) + geom_bar(aes( y = 100 * (..count..) /
sum(..count..)), width = 0.5) + ggtitle('Tech Support') +
xlab('Tech Support') + ylab('Percentage') + coord_flip() + theme_economist()
p12 <- ggplot(ChurnData, aes(x = StreamingTV)) + geom_bar(aes( y = 100 * (..count..) /
sum(..count..)), width = 0.5) + ggtitle('Streaming TV') +
xlab('Streaming TV') + ylab('Percentage') + coord_flip() + theme_economist()
grid.arrange(p9, p10, p11, p12, ncol = 2)
p13 <- ggplot(ChurnData, aes(x = StreamingMovies)) + geom_bar(aes( y = 100 * (..count..) /
sum(..count..)), width = 0.5) + ggtitle('Streaming Movies') + xlab('StreamingMovies') +
ylab('Percentage') + coord_flip() + theme_economist()
p14 <- ggplot(ChurnData, aes(x = Contract)) + geom_bar(aes( y = 100 * (..count..) /
sum(..count..)), width = 0.5) + ggtitle('Contract Type') +
xlab('Contract') + ylab('Percentage') + coord_flip() + theme_economist()
p15 <- ggplot(ChurnData, aes(x = PaperlessBilling)) + geom_bar(aes( y = 100 * (..count..) /
sum(..count..)), width = 0.5) + ggtitle('Paperless Billing') +
xlab('Paperless Billing') + ylab('Percentage') + coord_flip() + theme_economist()
p16 <- ggplot(ChurnData, aes(x = PaymentMethod)) + geom_bar(aes( y = 100 * (..count..) /
sum(..count..)), width = 0.5) + ggtitle('Payment') +
xlab('Payment Method') + ylab('Percentage') + coord_flip() + theme_economist()
grid.arrange(p13, p14, p15, p16, ncol = 2)
Notice, over half of all customers are on a monthly contract.
The length of time a customer has been with the service may be important to churn. Notice the largest group of customers are the newest, with less than six months service. Half of them are lost by the end of the first year.
p17 <- ggplot(ChurnData, aes(x = tenure_group)) + geom_bar(aes( y = 100 * (..count..) /
sum(..count..)), width = 0.5) + ggtitle('Tenure Groups') +
xlab('Tenure Groups') + ylab('Percentage') + coord_flip() + theme_economist()
p17
Historically, 26% of customers in this data set churn. If we were to guess all customers stay, we would be right 74% of the time. That is, a naive approach of guessing the same result in all instances would have an accuracy rate of 74%. What we want to see in this study is if modelling the problem provides any improvement. In other words, does analysis and modelling add information?
p18 <- ggplot(ChurnData, aes(x = Churn)) + geom_bar(aes( y = 100 * (..count..) /
sum(..count..)), width = 0.5) + ggtitle('Customer Churn') +
xlab('Churn') + ylab('Percentage') + coord_flip() + theme_economist()
p18
More data preparation for modelling. Here we create dummy variables and remove redundant features.
ChurnDataDum <- dummy_cols(ChurnData)
ChurnTenure <- ChurnDataDum[, 19]
Churn <- ChurnDataDum[, 18]
ChurnDataDum <- ChurnDataDum[, -c(2:16, 18:22, 25:26, 28, 30, 34:35, 38:39, 41, 43, 45, 49, 51, 55:57, 69)]
Create training and testing subsets of the full data set. The data are split 70% for training and 30% for testing.
ChurnDataDum <- cbind(Churn, ChurnDataDum)
intrain <- createDataPartition(ChurnDataDum$Churn, p = 0.7, list = FALSE)
set.seed(2017)
training <- ChurnDataDum[intrain, ]
testing <- ChurnDataDum[-intrain, ]
dim(training); dim(testing)
## [1] 4931 33
## [1] 2112 33
Build the first log regression model.
LogModel <- glm(Churn ~. , family = binomial(link = 'logit'), data = training)
LogModel
##
## Call: glm(formula = Churn ~ ., family = binomial(link = "logit"), data = training)
##
## Coefficients:
## (Intercept)
## -4.337425
## gender1
## 0.005377
## TotalCharges
## 0.000102
## SeniorCitizen_Yes
## 0.394330
## Partner_Yes
## 0.112211
## Dependents_Yes
## -0.191822
## PhoneService_Yes
## -0.624438
## MultipleLines_Yes
## 0.308709
## InternetService_DSL
## 0.741352
## `InternetService_Fiber optic`
## 1.682586
## OnlineSecurity_Yes
## -0.453168
## OnlineBackup_Yes
## -0.322863
## DeviceProtection_Yes
## -0.011970
## TechSupport_Yes
## -0.286618
## StreamingTV_Yes
## 0.229010
## StreamingMovies_Yes
## 0.283132
## `Contract_Month-to-month`
## 1.688480
## `Contract_One year`
## 0.992836
## PaperlessBilling_Yes
## 0.371091
## `PaymentMethod_Electronic check`
## 0.370579
## `PaymentMethod_Mailed check`
## 0.011406
## `PaymentMethod_Bank transfer (automatic)`
## 0.071872
## `tenure_group_0 - 6 Mos`
## 1.920175
## `tenure_group_31 - 36 Mos`
## -0.020716
## `tenure_group_43 - 48 Mos`
## -0.040279
## `tenure_group_7 - 12 Mos`
## 1.079799
## `tenure_group_19 - 24 Mos`
## 0.299911
## `tenure_group_25 - 30 Mos`
## 0.253399
## `tenure_group_61 - 66 Mos`
## -0.783714
## `tenure_group_13 - 18 Mos`
## 0.707674
## `tenure_group_55 - 60 Mos`
## -0.572715
## `tenure_group_49 - 54 Mos`
## -0.070953
## `tenure_group_67 - 72 Mos`
## -0.595183
##
## Degrees of Freedom: 4930 Total (i.e. Null); 4898 Residual
## Null Deviance: 5710
## Residual Deviance: 4010 AIC: 4070
A summary of the first model indicates factors that are most important to understanding churn. Those below marked with asterisks are statistically significant. But we will not need so many to understand churn as we proceed with the analysis.
summary(LogModel)
##
## Call:
## glm(formula = Churn ~ ., family = binomial(link = "logit"), data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.200 -0.682 -0.279 0.608 3.290
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -4.34e+00 3.76e-01 -11.54
## gender1 5.38e-03 7.86e-02 0.07
## TotalCharges 1.02e-04 8.46e-05 1.20
## SeniorCitizen_Yes 3.94e-01 1.01e-01 3.89
## Partner_Yes 1.12e-01 9.52e-02 1.18
## Dependents_Yes -1.92e-01 1.10e-01 -1.75
## PhoneService_Yes -6.24e-01 1.63e-01 -3.82
## MultipleLines_Yes 3.09e-01 9.80e-02 3.15
## InternetService_DSL 7.41e-01 1.66e-01 4.45
## `InternetService_Fiber optic` 1.68e+00 1.72e-01 9.75
## OnlineSecurity_Yes -4.53e-01 1.04e-01 -4.37
## OnlineBackup_Yes -3.23e-01 9.47e-02 -3.41
## DeviceProtection_Yes -1.20e-02 9.71e-02 -0.12
## TechSupport_Yes -2.87e-01 1.05e-01 -2.74
## StreamingTV_Yes 2.29e-01 1.01e-01 2.27
## StreamingMovies_Yes 2.83e-01 1.00e-01 2.82
## `Contract_Month-to-month` 1.69e+00 2.26e-01 7.47
## `Contract_One year` 9.93e-01 2.21e-01 4.48
## PaperlessBilling_Yes 3.71e-01 9.03e-02 4.11
## `PaymentMethod_Electronic check` 3.71e-01 1.17e-01 3.18
## `PaymentMethod_Mailed check` 1.14e-02 1.41e-01 0.08
## `PaymentMethod_Bank transfer (automatic)` 7.19e-02 1.37e-01 0.52
## `tenure_group_0 - 6 Mos` 1.92e+00 2.94e-01 6.53
## `tenure_group_31 - 36 Mos` -2.07e-02 2.48e-01 -0.08
## `tenure_group_43 - 48 Mos` -4.03e-02 2.61e-01 -0.15
## `tenure_group_7 - 12 Mos` 1.08e+00 2.87e-01 3.76
## `tenure_group_19 - 24 Mos` 3.00e-01 2.65e-01 1.13
## `tenure_group_25 - 30 Mos` 2.53e-01 2.54e-01 1.00
## `tenure_group_61 - 66 Mos` -7.84e-01 3.48e-01 -2.25
## `tenure_group_13 - 18 Mos` 7.08e-01 2.75e-01 2.57
## `tenure_group_55 - 60 Mos` -5.73e-01 3.01e-01 -1.90
## `tenure_group_49 - 54 Mos` -7.10e-02 2.66e-01 -0.27
## `tenure_group_67 - 72 Mos` -5.95e-01 3.74e-01 -1.59
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## gender1 0.94545
## TotalCharges 0.22987
## SeniorCitizen_Yes 0.00010 ***
## Partner_Yes 0.23842
## Dependents_Yes 0.08000 .
## PhoneService_Yes 0.00013 ***
## MultipleLines_Yes 0.00162 **
## InternetService_DSL 8.5e-06 ***
## `InternetService_Fiber optic` < 2e-16 ***
## OnlineSecurity_Yes 1.2e-05 ***
## OnlineBackup_Yes 0.00066 ***
## DeviceProtection_Yes 0.90190
## TechSupport_Yes 0.00618 **
## StreamingTV_Yes 0.02320 *
## StreamingMovies_Yes 0.00479 **
## `Contract_Month-to-month` 8.3e-14 ***
## `Contract_One year` 7.3e-06 ***
## PaperlessBilling_Yes 4.0e-05 ***
## `PaymentMethod_Electronic check` 0.00148 **
## `PaymentMethod_Mailed check` 0.93535
## `PaymentMethod_Bank transfer (automatic)` 0.60077
## `tenure_group_0 - 6 Mos` 6.4e-11 ***
## `tenure_group_31 - 36 Mos` 0.93340
## `tenure_group_43 - 48 Mos` 0.87750
## `tenure_group_7 - 12 Mos` 0.00017 ***
## `tenure_group_19 - 24 Mos` 0.25759
## `tenure_group_25 - 30 Mos` 0.31792
## `tenure_group_61 - 66 Mos` 0.02441 *
## `tenure_group_13 - 18 Mos` 0.01014 *
## `tenure_group_55 - 60 Mos` 0.05696 .
## `tenure_group_49 - 54 Mos` 0.78999
## `tenure_group_67 - 72 Mos` 0.11150
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5707.1 on 4930 degrees of freedom
## Residual deviance: 4005.5 on 4898 degrees of freedom
## AIC: 4072
##
## Number of Fisher Scoring iterations: 6
The analysis of variance (ANOVA) gives similar results regarding the importance of specific factors.
anova(LogModel, test = 'Chisq')
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Churn
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 4930 5707
## gender 1 0 4929 5707
## TotalCharges 1 188 4928 5519
## SeniorCitizen_Yes 1 199 4927 5320
## Partner_Yes 1 31 4926 5288
## Dependents_Yes 1 59 4925 5229
## PhoneService_Yes 1 6 4924 5223
## MultipleLines_Yes 1 74 4923 5149
## InternetService_DSL 1 64 4922 5085
## `InternetService_Fiber optic` 1 668 4921 4417
## OnlineSecurity_Yes 1 48 4920 4370
## OnlineBackup_Yes 1 9 4919 4360
## DeviceProtection_Yes 1 0 4918 4360
## TechSupport_Yes 1 18 4917 4343
## StreamingTV_Yes 1 23 4916 4319
## StreamingMovies_Yes 1 16 4915 4303
## `Contract_Month-to-month` 1 102 4914 4201
## `Contract_One year` 1 23 4913 4179
## PaperlessBilling_Yes 1 15 4912 4164
## `PaymentMethod_Electronic check` 1 19 4911 4145
## `PaymentMethod_Mailed check` 1 2 4910 4143
## `PaymentMethod_Bank transfer (automatic)` 1 0 4909 4142
## `tenure_group_0 - 6 Mos` 1 106 4908 4036
## `tenure_group_31 - 36 Mos` 1 3 4907 4033
## `tenure_group_43 - 48 Mos` 1 0 4906 4033
## `tenure_group_7 - 12 Mos` 1 13 4905 4020
## `tenure_group_19 - 24 Mos` 1 0 4904 4020
## `tenure_group_25 - 30 Mos` 1 0 4903 4020
## `tenure_group_61 - 66 Mos` 1 2 4902 4018
## `tenure_group_13 - 18 Mos` 1 7 4901 4011
## `tenure_group_55 - 60 Mos` 1 2 4900 4009
## `tenure_group_49 - 54 Mos` 1 1 4899 4008
## `tenure_group_67 - 72 Mos` 1 3 4898 4006
## Pr(>Chi)
## NULL
## gender 0.67211
## TotalCharges < 2e-16 ***
## SeniorCitizen_Yes < 2e-16 ***
## Partner_Yes 2.0e-08 ***
## Dependents_Yes 1.8e-14 ***
## PhoneService_Yes 0.01431 *
## MultipleLines_Yes < 2e-16 ***
## InternetService_DSL 1.2e-15 ***
## `InternetService_Fiber optic` < 2e-16 ***
## OnlineSecurity_Yes 4.4e-12 ***
## OnlineBackup_Yes 0.00242 **
## DeviceProtection_Yes 0.72241
## TechSupport_Yes 2.7e-05 ***
## StreamingTV_Yes 1.3e-06 ***
## StreamingMovies_Yes 5.9e-05 ***
## `Contract_Month-to-month` < 2e-16 ***
## `Contract_One year` 2.0e-06 ***
## PaperlessBilling_Yes 0.00012 ***
## `PaymentMethod_Electronic check` 1.3e-05 ***
## `PaymentMethod_Mailed check` 0.13387
## `PaymentMethod_Bank transfer (automatic)` 0.60636
## `tenure_group_0 - 6 Mos` < 2e-16 ***
## `tenure_group_31 - 36 Mos` 0.08333 .
## `tenure_group_43 - 48 Mos` 0.92327
## `tenure_group_7 - 12 Mos` 0.00037 ***
## `tenure_group_19 - 24 Mos` 0.52028
## `tenure_group_25 - 30 Mos` 0.70368
## `tenure_group_61 - 66 Mos` 0.14457
## `tenure_group_13 - 18 Mos` 0.00975 **
## `tenure_group_55 - 60 Mos` 0.13323
## `tenure_group_49 - 54 Mos` 0.41911
## `tenure_group_67 - 72 Mos` 0.10720
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Removing insignificant features from the first table yields a simpler model, so we will build a second log regression analysis. In this model, we have incorporated results from the summary() and anova() commands following to streamline the analysis.
LogModel2 <- glm(Churn ~ TotalCharges + MultipleLines_Yes + InternetService_DSL +
`InternetService_Fiber optic` + OnlineSecurity_Yes +
TechSupport_Yes + StreamingTV_Yes + StreamingMovies_Yes +
`Contract_Month-to-month` + `Contract_One year` + PaperlessBilling_Yes +
`PaymentMethod_Electronic check` + `tenure_group_0 - 6 Mos`,
family = binomial(link = 'logit'),
data = training)
LogModel2
##
## Call: glm(formula = Churn ~ TotalCharges + MultipleLines_Yes + InternetService_DSL +
## `InternetService_Fiber optic` + OnlineSecurity_Yes + TechSupport_Yes +
## StreamingTV_Yes + StreamingMovies_Yes + `Contract_Month-to-month` +
## `Contract_One year` + PaperlessBilling_Yes + `PaymentMethod_Electronic check` +
## `tenure_group_0 - 6 Mos`, family = binomial(link = "logit"),
## data = training)
##
## Coefficients:
## (Intercept) TotalCharges
## -4.262754 -0.000187
## MultipleLines_Yes InternetService_DSL
## 0.289061 0.931977
## `InternetService_Fiber optic` OnlineSecurity_Yes
## 1.914219 -0.444324
## TechSupport_Yes StreamingTV_Yes
## -0.286239 0.295537
## StreamingMovies_Yes `Contract_Month-to-month`
## 0.394641 1.779448
## `Contract_One year` PaperlessBilling_Yes
## 0.991664 0.385197
## `PaymentMethod_Electronic check` `tenure_group_0 - 6 Mos`
## 0.362685 1.050659
##
## Degrees of Freedom: 4930 Total (i.e. Null); 4917 Residual
## Null Deviance: 5710
## Residual Deviance: 4080 AIC: 4110
Again, the summary confirms what we saw earlier regarding the most significant features.
summary(LogModel2)
##
## Call:
## glm(formula = Churn ~ TotalCharges + MultipleLines_Yes + InternetService_DSL +
## `InternetService_Fiber optic` + OnlineSecurity_Yes + TechSupport_Yes +
## StreamingTV_Yes + StreamingMovies_Yes + `Contract_Month-to-month` +
## `Contract_One year` + PaperlessBilling_Yes + `PaymentMethod_Electronic check` +
## `tenure_group_0 - 6 Mos`, family = binomial(link = "logit"),
## data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.124 -0.660 -0.288 0.621 2.993
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.26e+00 2.30e-01 -18.52 < 2e-16 ***
## TotalCharges -1.87e-04 3.15e-05 -5.92 3.1e-09 ***
## MultipleLines_Yes 2.89e-01 9.33e-02 3.10 0.0019 **
## InternetService_DSL 9.32e-01 1.53e-01 6.08 1.2e-09 ***
## `InternetService_Fiber optic` 1.91e+00 1.60e-01 11.95 < 2e-16 ***
## OnlineSecurity_Yes -4.44e-01 1.02e-01 -4.36 1.3e-05 ***
## TechSupport_Yes -2.86e-01 1.02e-01 -2.81 0.0050 **
## StreamingTV_Yes 2.96e-01 9.71e-02 3.04 0.0023 **
## StreamingMovies_Yes 3.95e-01 9.64e-02 4.10 4.2e-05 ***
## `Contract_Month-to-month` 1.78e+00 2.04e-01 8.71 < 2e-16 ***
## `Contract_One year` 9.92e-01 2.06e-01 4.82 1.4e-06 ***
## PaperlessBilling_Yes 3.85e-01 8.92e-02 4.32 1.6e-05 ***
## `PaymentMethod_Electronic check` 3.63e-01 8.30e-02 4.37 1.2e-05 ***
## `tenure_group_0 - 6 Mos` 1.05e+00 1.05e-01 10.01 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5707.1 on 4930 degrees of freedom
## Residual deviance: 4077.4 on 4917 degrees of freedom
## AIC: 4105
##
## Number of Fisher Scoring iterations: 6
The ANOVA of the second log regression model is similar to the first.
anova(LogModel2, test = 'Chisq')
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Churn
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 4930 5707
## TotalCharges 1 188 4929 5519 < 2e-16
## MultipleLines_Yes 1 114 4928 5405 < 2e-16
## InternetService_DSL 1 62 4927 5344 4.0e-15
## `InternetService_Fiber optic` 1 861 4926 4482 < 2e-16
## OnlineSecurity_Yes 1 57 4925 4425 4.4e-14
## TechSupport_Yes 1 24 4924 4401 8.3e-07
## StreamingTV_Yes 1 24 4923 4376 7.7e-07
## StreamingMovies_Yes 1 19 4922 4358 1.4e-05
## `Contract_Month-to-month` 1 114 4921 4243 < 2e-16
## `Contract_One year` 1 23 4920 4220 1.3e-06
## PaperlessBilling_Yes 1 17 4919 4203 3.3e-05
## `PaymentMethod_Electronic check` 1 22 4918 4181 2.7e-06
## `tenure_group_0 - 6 Mos` 1 103 4917 4077 < 2e-16
##
## NULL
## TotalCharges ***
## MultipleLines_Yes ***
## InternetService_DSL ***
## `InternetService_Fiber optic` ***
## OnlineSecurity_Yes ***
## TechSupport_Yes ***
## StreamingTV_Yes ***
## StreamingMovies_Yes ***
## `Contract_Month-to-month` ***
## `Contract_One year` ***
## PaperlessBilling_Yes ***
## `PaymentMethod_Electronic check` ***
## `tenure_group_0 - 6 Mos` ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We can now develop predictions about whether a customer is likely to churn.
glm.fit <- predict.glm(LogModel, newdata = testing, type = 'response')
glm.fit2 <- predict.glm(LogModel2, newdata = testing, type = 'response')
glm.fit <- round(glm.fit, 0)
glm.fit <- as.character(glm.fit)
glm.fit2 <- round(glm.fit2, 0)
glm.fit2 <- as.character(glm.fit2)
An accuracy test for the first log regression model shows it is 82% accurate. This is a 10% improvement over guessing.
testing$Churn <- as.character(testing$Churn)
misClassErr <- mean(glm.fit != testing$Churn)
misClassErr2 <- mean(glm.fit2 != testing$Churn)
print('Accuracy Rate LogMod'); 1 - misClassErr
## [1] "Accuracy Rate LogMod"
## [1] 0.7973
We get the same results for the accuracy of the second model.
print('Accuracy Rate LogMod2'); 1 - misClassErr2
## [1] "Accuracy Rate LogMod2"
## [1] 0.7992
As can be seen by the confusion tables that follow, the models appear to be better at predicting which customers are least likely to churn than they are at identifying customers who are most likely to churn. False positives make up 30% of positive predictions, whereas false negatives make up 15% of negative predictions.
print("Confusion Matrix for Log Regression Model 1"); table(testing$Churn, glm.fit > 0.5)
## [1] "Confusion Matrix for Log Regression Model 1"
##
## FALSE TRUE
## 0 1389 163
## 1 265 295
print("Confusion Matrix for Log Regression Model 2"); table(testing$Churn, glm.fit2 > 0.5)
## [1] "Confusion Matrix for Log Regression Model 2"
##
## FALSE TRUE
## 0 1401 151
## 1 273 287
Whether a customer churns or not is a binary decision – “Yes” or “No”. Decision trees are made for this kind of problem. In this first pass, just four factors are considered significant for predictive purposes. They are identified in the output that follows. We see the error rate is roughly 21%, on par with the error rates from the log regression models.
NoChurn <- ifelse(ChurnDataDum$Churn == 0, 'No', 'Yes')
ChurnDataDum <- data.frame(NoChurn, ChurnDataDum)
DT.Churn <- tree(NoChurn ~. -Churn, data = ChurnDataDum)
summary(DT.Churn)
##
## Classification tree:
## tree(formula = NoChurn ~ . - Churn, data = ChurnDataDum)
## Variables actually used in tree construction:
## [1] "Contract_Month.to.month" "InternetService_Fiber.optic"
## [3] "tenure_group_0...6.Mos" "TotalCharges"
## Number of terminal nodes: 6
## Residual mean deviance: 0.881 = 6200 / 7040
## Misclassification error rate: 0.209 = 1470 / 7043
A visual of the tree contributes to understanding the importance of the four features and how they rank with each other. We can see this tree consists of six terminal nodes, only one of which results in the customer’s decision to leave service. In this analysis, a customer on a month-to-month contract, with a fiber optic connection, who has paid less than $1556 in total charges, is a likely candidate to churn.
plot(DT.Churn); text(DT.Churn)
Using the decision tree model just developed, we can generate specific predictions. Again, decision tree accuracy is about 78%, a 6% improvement over guessing.
set.seed(805)
ChurnDataDum <- ChurnDataDum[-7043, ]
DT.train <- sample(1:nrow(ChurnDataDum), 3521)
DT.test <- ChurnDataDum[-DT.train, ]
DT.Churn2 <- tree(NoChurn ~ . -Churn, data = ChurnDataDum, subset = DT.train)
DT.pred <- predict(DT.Churn2, DT.test, type = 'class')
confusionMatrix(DT.pred, DT.test$NoChurn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 2411 627
## Yes 152 331
##
## Accuracy : 0.779
## 95% CI : (0.765, 0.792)
## No Information Rate : 0.728
## P-Value [Acc > NIR] : 2.55e-12
##
## Kappa : 0.339
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.941
## Specificity : 0.346
## Pos Pred Value : 0.794
## Neg Pred Value : 0.685
## Prevalence : 0.728
## Detection Rate : 0.685
## Detection Prevalence : 0.863
## Balanced Accuracy : 0.643
##
## 'Positive' Class : No
##
It is possible to further simplify the decision tree model by pruning back branches that don’t contribute to model accuracy.
set.seed(805)
cv.Churn <- cv.tree(DT.Churn, FUN = prune.misclass)
names(cv.Churn)
## [1] "size" "dev" "k" "method"
cv.Churn
## $size
## [1] 6 4 1
##
## $dev
## [1] 1501 1501 1869
##
## $k
## [1] -Inf 0 133
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
It appears a tree with four terminal nodes is just as accurate as a tree with six nodes. Again, only one node results in a customer churning, which means 75% of customers (3 of 4) do not churn in the model. That is on par with guessing.
prune.Churn <- prune.misclass(DT.Churn, best = 4)
plot(prune.Churn); text(prune.Churn)
The four-node tree is just as accurate as the six-node tree, with an accuracy rate of 78%.
prune.pred <- predict(prune.Churn, DT.test, type = 'class')
confusionMatrix(prune.pred, DT.test$NoChurn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 2387 594
## Yes 176 364
##
## Accuracy : 0.781
## 95% CI : (0.767, 0.795)
## No Information Rate : 0.728
## P-Value [Acc > NIR] : 1.94e-13
##
## Kappa : 0.361
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.931
## Specificity : 0.380
## Pos Pred Value : 0.801
## Neg Pred Value : 0.674
## Prevalence : 0.728
## Detection Rate : 0.678
## Detection Prevalence : 0.847
## Balanced Accuracy : 0.656
##
## 'Positive' Class : No
##
An ensemble method, such as random forest, builds out a large number of decision tree models and averages their results. At 79%, the accuracy rate of this approach is similar to the log regression and decision tree algorithms. This algorithm offers a 7% improvement over guessing. The random forest is a slight improvement over the decision tree, in this analysis.
set.seed(805)
rf.Churn <- randomForest(NoChurn ~ . -Churn, data = ChurnDataDum, subset = DT.train,
mtry = 6, importance = TRUE)
pred.rf <- predict(rf.Churn, newdata = DT.test)
confusionMatrix(pred.rf, DT.test$NoChurn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 2357 543
## Yes 206 415
##
## Accuracy : 0.787
## 95% CI : (0.773, 0.801)
## No Information Rate : 0.728
## P-Value [Acc > NIR] : 2.83e-16
##
## Kappa : 0.396
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.920
## Specificity : 0.433
## Pos Pred Value : 0.813
## Neg Pred Value : 0.668
## Prevalence : 0.728
## Detection Rate : 0.669
## Detection Prevalence : 0.824
## Balanced Accuracy : 0.676
##
## 'Positive' Class : No
##
Visualizing these results is helpful. Again, we see the models are better at identifying a customer unlikely to leave better than they are at identifying a customer who is likely to leave.
plot(pred.rf, DT.test$NoChurn, main = 'Random Forest Results',
xlab = 'Predicted Value', ylab = 'Actual Value')
Finally, we can see which features contribute most to a customer’s decision to churn. In tabular form, they are unranked.
importance(rf.Churn)
## No Yes
## gender -1.4240 -0.6984
## TotalCharges 22.3778 21.2693
## SeniorCitizen_Yes 9.7989 -3.2163
## Partner_Yes 4.3332 1.8516
## Dependents_Yes 1.2990 10.2147
## PhoneService_Yes -8.9566 19.9595
## MultipleLines_Yes 3.4685 7.9834
## InternetService_DSL -10.0015 20.9954
## InternetService_Fiber.optic 15.6548 47.8150
## OnlineSecurity_Yes 6.0971 26.8004
## OnlineBackup_Yes 10.9881 -1.6273
## DeviceProtection_Yes 16.7526 -8.9533
## TechSupport_Yes 11.5305 14.0583
## StreamingTV_Yes 10.5591 5.5190
## StreamingMovies_Yes 9.2498 2.4459
## Contract_Month.to.month 0.4334 31.5580
## Contract_One.year -15.7257 14.3068
## PaperlessBilling_Yes -0.7857 22.1208
## PaymentMethod_Electronic.check 2.2306 21.3031
## PaymentMethod_Mailed.check 2.8051 -2.4988
## PaymentMethod_Bank.transfer..automatic. -1.2166 1.3519
## tenure_group_0...6.Mos 15.2599 16.6315
## tenure_group_31...36.Mos -1.6638 0.6426
## tenure_group_43...48.Mos -2.8084 2.4723
## tenure_group_7...12.Mos 3.7959 -1.1609
## tenure_group_19...24.Mos -0.2397 4.0860
## tenure_group_25...30.Mos 0.3202 5.4111
## tenure_group_61...66.Mos 5.5270 7.2554
## tenure_group_13...18.Mos -4.2426 6.9804
## tenure_group_55...60.Mos 1.4893 9.1902
## tenure_group_49...54.Mos -6.9619 6.3927
## tenure_group_67...72.Mos 2.3431 9.8952
## MeanDecreaseAccuracy
## gender -1.5700
## TotalCharges 35.9973
## SeniorCitizen_Yes 6.2353
## Partner_Yes 5.1635
## Dependents_Yes 8.6654
## PhoneService_Yes 4.8953
## MultipleLines_Yes 8.5141
## InternetService_DSL 10.8236
## InternetService_Fiber.optic 44.8226
## OnlineSecurity_Yes 24.3397
## OnlineBackup_Yes 9.3206
## DeviceProtection_Yes 11.4520
## TechSupport_Yes 17.4874
## StreamingTV_Yes 13.5861
## StreamingMovies_Yes 10.3679
## Contract_Month.to.month 35.6396
## Contract_One.year -1.5028
## PaperlessBilling_Yes 14.7033
## PaymentMethod_Electronic.check 17.5580
## PaymentMethod_Mailed.check 1.2028
## PaymentMethod_Bank.transfer..automatic. -0.3221
## tenure_group_0...6.Mos 26.8217
## tenure_group_31...36.Mos -1.1185
## tenure_group_43...48.Mos -0.7022
## tenure_group_7...12.Mos 2.8364
## tenure_group_19...24.Mos 2.7050
## tenure_group_25...30.Mos 3.6035
## tenure_group_61...66.Mos 9.8302
## tenure_group_13...18.Mos 1.6585
## tenure_group_55...60.Mos 6.9838
## tenure_group_49...54.Mos -2.6947
## tenure_group_67...72.Mos 10.5005
## MeanDecreaseGini
## gender 30.394
## TotalCharges 227.613
## SeniorCitizen_Yes 29.927
## Partner_Yes 29.668
## Dependents_Yes 27.690
## PhoneService_Yes 15.344
## MultipleLines_Yes 30.067
## InternetService_DSL 19.731
## InternetService_Fiber.optic 80.697
## OnlineSecurity_Yes 35.915
## OnlineBackup_Yes 30.307
## DeviceProtection_Yes 29.019
## TechSupport_Yes 32.638
## StreamingTV_Yes 32.089
## StreamingMovies_Yes 30.977
## Contract_Month.to.month 106.655
## Contract_One.year 14.459
## PaperlessBilling_Yes 37.695
## PaymentMethod_Electronic.check 57.663
## PaymentMethod_Mailed.check 16.087
## PaymentMethod_Bank.transfer..automatic. 17.029
## tenure_group_0...6.Mos 53.066
## tenure_group_31...36.Mos 11.315
## tenure_group_43...48.Mos 10.202
## tenure_group_7...12.Mos 12.130
## tenure_group_19...24.Mos 12.882
## tenure_group_25...30.Mos 10.041
## tenure_group_61...66.Mos 7.765
## tenure_group_13...18.Mos 13.310
## tenure_group_55...60.Mos 10.231
## tenure_group_49...54.Mos 9.982
## tenure_group_67...72.Mos 11.519
When we rank them in graphic form, we find the top four features in the left hand chart (below) match those from the IBM analysis.
varImpPlot(rf.Churn, n.var = 10)