Step 1: Data loading
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0 v purrr 0.2.4
## v tibble 1.4.2 v dplyr 0.7.6
## v tidyr 0.8.1 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## Warning: package 'ggplot2' was built under R version 3.4.4
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'readr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Warning: package 'data.table' was built under R version 3.4.4
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
# Load the dataset
dt <- read.csv(file.choose())
dt <- as.data.frame(dt)
# check the dataset
str(dt)
## 'data.frame': 7043 obs. of 21 variables:
## $ customerID : Factor w/ 7043 levels "0002-ORFBO","0003-MKNFE",..: 5376 3963 2565 5536 6512 6552 1003 4771 5605 4535 ...
## $ gender : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 1 2 1 1 2 ...
## $ SeniorCitizen : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Partner : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 2 1 ...
## $ Dependents : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 2 ...
## $ tenure : int 1 34 2 45 2 8 22 10 28 62 ...
## $ PhoneService : Factor w/ 2 levels "No","Yes": 1 2 2 1 2 2 2 1 2 2 ...
## $ MultipleLines : Factor w/ 3 levels "No","No phone service",..: 2 1 1 2 1 3 3 2 3 1 ...
## $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 1 1 1 1 2 2 2 1 2 1 ...
## $ OnlineSecurity : Factor w/ 3 levels "No","No internet service",..: 1 3 3 3 1 1 1 3 1 3 ...
## $ OnlineBackup : Factor w/ 3 levels "No","No internet service",..: 3 1 3 1 1 1 3 1 1 3 ...
## $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 1 3 1 3 1 3 1 1 3 1 ...
## $ TechSupport : Factor w/ 3 levels "No","No internet service",..: 1 1 1 3 1 1 1 1 3 1 ...
## $ StreamingTV : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 3 1 3 1 ...
## $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 1 1 3 1 ...
## $ Contract : Factor w/ 3 levels "Month-to-month",..: 1 2 1 2 1 1 1 1 1 2 ...
## $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 2 1 2 1 2 2 2 1 2 1 ...
## $ PaymentMethod : Factor w/ 4 levels "Bank transfer (automatic)",..: 3 4 4 1 3 3 2 4 3 1 ...
## $ MonthlyCharges : num 29.9 57 53.9 42.3 70.7 ...
## $ TotalCharges : num 29.9 1889.5 108.2 1840.8 151.7 ...
## $ Churn : Factor w/ 2 levels "No","Yes": 1 1 2 1 2 2 1 1 2 1 ...
#check the data type of each column
sapply(dt, class)
## customerID gender SeniorCitizen Partner
## "factor" "factor" "integer" "factor"
## Dependents tenure PhoneService MultipleLines
## "factor" "integer" "factor" "factor"
## InternetService OnlineSecurity OnlineBackup DeviceProtection
## "factor" "factor" "factor" "factor"
## TechSupport StreamingTV StreamingMovies Contract
## "factor" "factor" "factor" "factor"
## PaperlessBilling PaymentMethod MonthlyCharges TotalCharges
## "factor" "factor" "numeric" "numeric"
## Churn
## "factor"
Step 2: Data wrangling and transformation
# check the columns with NA values
sapply(dt, function(x) sum(is.na(x)))
## customerID gender SeniorCitizen Partner
## 0 0 0 0
## Dependents tenure PhoneService MultipleLines
## 0 0 0 0
## InternetService OnlineSecurity OnlineBackup DeviceProtection
## 0 0 0 0
## TechSupport StreamingTV StreamingMovies Contract
## 0 0 0 0
## PaperlessBilling PaymentMethod MonthlyCharges TotalCharges
## 0 0 0 11
## Churn
## 0
# remove the rows with NA values
dt <- dt[complete.cases(dt) == TRUE, ]
# check the unique calues for each column
for (col in names(dt)){
if (class(dt[[col]]) %in% c("character", "factor") & col != 'customerID'){
print(unique(dt[col]))
}
}
## gender
## 1 Female
## 2 Male
## Partner
## 1 Yes
## 2 No
## Dependents
## 1 No
## 7 Yes
## PhoneService
## 1 No
## 2 Yes
## MultipleLines
## 1 No phone service
## 2 No
## 6 Yes
## InternetService
## 1 DSL
## 5 Fiber optic
## 12 No
## OnlineSecurity
## 1 No
## 2 Yes
## 12 No internet service
## OnlineBackup
## 1 Yes
## 2 No
## 12 No internet service
## DeviceProtection
## 1 No
## 2 Yes
## 12 No internet service
## TechSupport
## 1 No
## 4 Yes
## 12 No internet service
## StreamingTV
## 1 No
## 6 Yes
## 12 No internet service
## StreamingMovies
## 1 No
## 6 Yes
## 12 No internet service
## Contract
## 1 Month-to-month
## 2 One year
## 12 Two year
## PaperlessBilling
## 1 Yes
## 2 No
## PaymentMethod
## 1 Electronic check
## 2 Mailed check
## 4 Bank transfer (automatic)
## 7 Credit card (automatic)
## Churn
## 1 No
## 3 Yes
#need to change "No internet service" to "No" for "OnlineSecurity", #"OnlineBackup", "DeviceProtection", "TechSupport", "streamingTV", "streamingMovies".
for(i in c(10:15)) {
print(names(dt[i]))
}
## [1] "OnlineSecurity"
## [1] "OnlineBackup"
## [1] "DeviceProtection"
## [1] "TechSupport"
## [1] "StreamingTV"
## [1] "StreamingMovies"
for(i in c(10:15)) {
dt[i][dt[i]=="No internet service"] <- "No"
}
# change "No phone service" to "No" for MultipleLines
dt$MultipleLines[dt$MultipleLines == "No phone service"] <- "No"
#remove CustomerID column
dt$customerID <- NULL
#check tenure
min(dt$tenure); max(dt$tenure)
## [1] 1
## [1] 72
#transform tenure to
dt$tenure <- as.factor(ifelse(dt$tenure == 0, '0',
ifelse(dt$tenure <= 24, '1',
ifelse(dt$tenure <= 48, '2', '3'
))))
#tranform Churn values from Yes-no to 1-0
dt$Churn <- as.factor(ifelse(dt$Churn == "Yes", 1, 0))
#change SeniorCitizen type to factor
dt$SeniorCitizen <- as.factor(dt$SeniorCitizen)
# transform the needed columns to factor data type
for (col in names(dt)){
if (class(dt[[col]]) == "character"){
dt <- dt%>%
mutate_at(col, funs(factor(.)))
}
}
#check the data type of each column again
str(dt)
## 'data.frame': 7032 obs. of 20 variables:
## $ gender : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 1 2 1 1 2 ...
## $ SeniorCitizen : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Partner : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 2 1 ...
## $ Dependents : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 2 ...
## $ tenure : Factor w/ 3 levels "1","2","3": 1 2 1 2 1 1 1 1 2 3 ...
## $ PhoneService : Factor w/ 2 levels "No","Yes": 1 2 2 1 2 2 2 1 2 2 ...
## $ MultipleLines : Factor w/ 3 levels "No","No phone service",..: 1 1 1 1 1 3 3 1 3 1 ...
## $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 1 1 1 1 2 2 2 1 2 1 ...
## $ OnlineSecurity : Factor w/ 3 levels "No","No internet service",..: 1 3 3 3 1 1 1 3 1 3 ...
## $ OnlineBackup : Factor w/ 3 levels "No","No internet service",..: 3 1 3 1 1 1 3 1 1 3 ...
## $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 1 3 1 3 1 3 1 1 3 1 ...
## $ TechSupport : Factor w/ 3 levels "No","No internet service",..: 1 1 1 3 1 1 1 1 3 1 ...
## $ StreamingTV : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 3 1 3 1 ...
## $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 1 1 3 1 ...
## $ Contract : Factor w/ 3 levels "Month-to-month",..: 1 2 1 2 1 1 1 1 1 2 ...
## $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 2 1 2 1 2 2 2 1 2 1 ...
## $ PaymentMethod : Factor w/ 4 levels "Bank transfer (automatic)",..: 3 4 4 1 3 3 2 4 3 1 ...
## $ MonthlyCharges : num 29.9 57 53.9 42.3 70.7 ...
## $ TotalCharges : num 29.9 1889.5 108.2 1840.8 151.7 ...
## $ Churn : Factor w/ 2 levels "0","1": 1 1 2 1 2 2 1 1 2 1 ...
Step 3: Preparing test and training data
#Check the numeric variables and correlation between them
cols <- sapply(dt, is.numeric)
cor(dt%>%select(MonthlyCharges, TotalCharges))
## MonthlyCharges TotalCharges
## MonthlyCharges 1.0000000 0.6510648
## TotalCharges 0.6510648 1.0000000
dt$TotalCharges <- NULL
#generate random sample
set.seed(999)
sample <- sample.int(n = nrow(dt), size = floor(0.7*nrow(dt)), replace = F)
# split data to train and test datasets
train <- dt[sample, ]
test <- dt[-sample, ]
dim(train); dim(test)
## [1] 4922 19
## [1] 2110 19
First Model: General linear model
#create logistic regression.
LogModel <- glm(Churn ~ .,family=binomial(link="logit"),data=train)
print(summary(LogModel))
##
## Call:
## glm(formula = Churn ~ ., family = binomial(link = "logit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8526 -0.6763 -0.3117 0.7428 3.0829
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.057963 0.950965 0.061 0.951398
## genderMale 0.002734 0.076868 0.036 0.971628
## SeniorCitizen1 0.268926 0.099729 2.697 0.007006
## PartnerYes -0.097740 0.091170 -1.072 0.283691
## DependentsYes -0.062328 0.106214 -0.587 0.557324
## tenure2 -0.834858 0.109062 -7.655 1.93e-14
## tenure3 -1.052650 0.141704 -7.429 1.10e-13
## PhoneServiceYes 0.101311 0.766154 0.132 0.894800
## MultipleLinesYes 0.275235 0.208621 1.319 0.187067
## InternetServiceFiber optic 1.496699 0.943214 1.587 0.112557
## InternetServiceNo -1.588205 0.952715 -1.667 0.095508
## OnlineSecurityYes -0.319207 0.211096 -1.512 0.130498
## OnlineBackupYes -0.063789 0.207233 -0.308 0.758225
## DeviceProtectionYes 0.115565 0.207933 0.556 0.578362
## TechSupportYes -0.273304 0.213394 -1.281 0.200281
## StreamingTVYes 0.418491 0.385555 1.085 0.277733
## StreamingMoviesYes 0.490949 0.385289 1.274 0.202580
## ContractOne year -0.717008 0.123875 -5.788 7.12e-09
## ContractTwo year -1.599819 0.200084 -7.996 1.29e-15
## PaperlessBillingYes 0.348024 0.088038 3.953 7.71e-05
## PaymentMethodCredit card (automatic) -0.026721 0.133284 -0.200 0.841105
## PaymentMethodElectronic check 0.425187 0.111787 3.804 0.000143
## PaymentMethodMailed check 0.041133 0.136285 0.302 0.762794
## MonthlyCharges -0.024963 0.037454 -0.666 0.505098
##
## (Intercept)
## genderMale
## SeniorCitizen1 **
## PartnerYes
## DependentsYes
## tenure2 ***
## tenure3 ***
## PhoneServiceYes
## MultipleLinesYes
## InternetServiceFiber optic
## InternetServiceNo .
## OnlineSecurityYes
## OnlineBackupYes
## DeviceProtectionYes
## TechSupportYes
## StreamingTVYes
## StreamingMoviesYes
## ContractOne year ***
## ContractTwo year ***
## PaperlessBillingYes ***
## PaymentMethodCredit card (automatic)
## PaymentMethodElectronic check ***
## PaymentMethodMailed check
## MonthlyCharges
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5681.1 on 4921 degrees of freedom
## Residual deviance: 4168.8 on 4898 degrees of freedom
## AIC: 4216.8
##
## Number of Fisher Scoring iterations: 6
#analyze the deviance table with anova
anova(LogModel, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Churn
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 4921 5681.1
## gender 1 0.01 4920 5681.1 0.907052
## SeniorCitizen 1 126.93 4919 5554.2 < 2.2e-16 ***
## Partner 1 101.04 4918 5453.1 < 2.2e-16 ***
## Dependents 1 27.79 4917 5425.3 1.353e-07 ***
## tenure 2 419.20 4915 5006.1 < 2.2e-16 ***
## PhoneService 1 1.30 4914 5004.8 0.254274
## MultipleLines 1 88.59 4913 4916.2 < 2.2e-16 ***
## InternetService 2 481.20 4911 4435.1 < 2.2e-16 ***
## OnlineSecurity 1 56.60 4910 4378.4 5.335e-14 ***
## OnlineBackup 1 6.18 4909 4372.3 0.012917 *
## DeviceProtection 1 2.63 4908 4369.6 0.104753
## TechSupport 1 38.34 4907 4331.3 5.953e-10 ***
## StreamingTV 1 7.77 4906 4323.5 0.005318 **
## StreamingMovies 1 7.86 4905 4315.7 0.005066 **
## Contract 2 102.76 4903 4212.9 < 2.2e-16 ***
## PaperlessBilling 1 17.18 4902 4195.7 3.407e-05 ***
## PaymentMethod 3 26.52 4899 4169.2 7.409e-06 ***
## MonthlyCharges 1 0.44 4898 4168.8 0.504985
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#create prediction model
predict_glm <- predict(LogModel, newdata = test[, -19], type = 'response')
test$Churn <- as.character(test$Churn)
test$Churn[test$Churn=="No"] <- "0"
test$Churn[test$Churn=="Yes"] <- "1"
#set the predicted value of churn by threshold = 0.5
predict_glm <- ifelse(predict_glm >0.5, 1, 0)
# get the accuracy of the model
misClasificError <- mean(predict_glm != test$Churn)
print(paste('Logistic Regression Accuracy',1-misClasificError))
## [1] "Logistic Regression Accuracy 0.801895734597156"
# Confusion matrix
table(test$Churn, predict_glm)
## predict_glm
## 0 1
## 0 1386 154
## 1 264 306
Second Model: Decision Tree
#way 1: with library "party"
library("party")
## Warning: package 'party' was built under R version 3.4.4
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.4.4
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.4.4
##
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
##
## boundary
tree <- ctree(Churn~Contract+tenure+PaperlessBilling, train)
plot(tree, type='simple')

# Make prediction on Test dataset
pred_tree <- predict(tree, test)
#Confusion Matrix for Decision Tree
table(Predicted = pred_tree, Actual = test$Churn)
## Actual
## Predicted 0 1
## 0 1308 267
## 1 232 303
p1 <- predict(tree, train)
tab1 <- table(Predicted = p1, Actual = train$Churn)
tab2 <- table(Predicted = pred_tree, Actual = test$Churn)
#Accuracy of Decision tree model on Train dataset
(tab1[1,1] + tab1[2,2]) / sum(tab1)
## [1] 0.756603
#Accuracy of Decision tree model on Test dataset
(tab2[1,1] + tab2[2,2]) / sum(tab2)
## [1] 0.7635071
# way2: with library "rpart"
library(rpart)
library("rpart.plot")
tree2 <- rpart(data=train, Churn ~ Contract+tenure+PaperlessBilling, method='class')
rpart.plot(tree2)

Third Model: Random Forrest
## Warning: package 'randomForest' was built under R version 3.4.4
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
#create model with training data and see the confusion matrix
rfModel <- randomForest(Churn ~., data = train)
print(rfModel)
##
## Call:
## randomForest(formula = Churn ~ ., data = train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 21.68%
## Confusion matrix:
## 0 1 class.error
## 0 3232 391 0.1079216
## 1 676 623 0.5204003
# predict churn on the test dataset with the created model
pred_rf <- predict(rfModel, test)
# check confusion matrix
table(pred_rf, as.factor(test$Churn))
##
## pred_rf 0 1
## 0 1395 298
## 1 145 272
# plot model to check error dynamics for the different number of trees
plot(rfModel)

varImpPlot(rfModel, sort=T, n.var = 10, main = 'Top 10 Feature Importance')
