Step 1: Data loading

library('tidyverse')
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0     v purrr   0.2.4
## v tibble  1.4.2     v dplyr   0.7.6
## v tidyr   0.8.1     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0
## Warning: package 'ggplot2' was built under R version 3.4.4
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'readr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library('data.table')
## Warning: package 'data.table' was built under R version 3.4.4
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
# Load the dataset

dt <- read.csv(file.choose())
dt <- as.data.frame(dt)

# check the dataset
str(dt)
## 'data.frame':    7043 obs. of  21 variables:
##  $ customerID      : Factor w/ 7043 levels "0002-ORFBO","0003-MKNFE",..: 5376 3963 2565 5536 6512 6552 1003 4771 5605 4535 ...
##  $ gender          : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 1 2 1 1 2 ...
##  $ SeniorCitizen   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Partner         : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 2 1 ...
##  $ Dependents      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 2 ...
##  $ tenure          : int  1 34 2 45 2 8 22 10 28 62 ...
##  $ PhoneService    : Factor w/ 2 levels "No","Yes": 1 2 2 1 2 2 2 1 2 2 ...
##  $ MultipleLines   : Factor w/ 3 levels "No","No phone service",..: 2 1 1 2 1 3 3 2 3 1 ...
##  $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 1 1 1 1 2 2 2 1 2 1 ...
##  $ OnlineSecurity  : Factor w/ 3 levels "No","No internet service",..: 1 3 3 3 1 1 1 3 1 3 ...
##  $ OnlineBackup    : Factor w/ 3 levels "No","No internet service",..: 3 1 3 1 1 1 3 1 1 3 ...
##  $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 1 3 1 3 1 3 1 1 3 1 ...
##  $ TechSupport     : Factor w/ 3 levels "No","No internet service",..: 1 1 1 3 1 1 1 1 3 1 ...
##  $ StreamingTV     : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 3 1 3 1 ...
##  $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 1 1 3 1 ...
##  $ Contract        : Factor w/ 3 levels "Month-to-month",..: 1 2 1 2 1 1 1 1 1 2 ...
##  $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 2 1 2 1 2 2 2 1 2 1 ...
##  $ PaymentMethod   : Factor w/ 4 levels "Bank transfer (automatic)",..: 3 4 4 1 3 3 2 4 3 1 ...
##  $ MonthlyCharges  : num  29.9 57 53.9 42.3 70.7 ...
##  $ TotalCharges    : num  29.9 1889.5 108.2 1840.8 151.7 ...
##  $ Churn           : Factor w/ 2 levels "No","Yes": 1 1 2 1 2 2 1 1 2 1 ...
#check the data type of each column
sapply(dt, class)
##       customerID           gender    SeniorCitizen          Partner 
##         "factor"         "factor"        "integer"         "factor" 
##       Dependents           tenure     PhoneService    MultipleLines 
##         "factor"        "integer"         "factor"         "factor" 
##  InternetService   OnlineSecurity     OnlineBackup DeviceProtection 
##         "factor"         "factor"         "factor"         "factor" 
##      TechSupport      StreamingTV  StreamingMovies         Contract 
##         "factor"         "factor"         "factor"         "factor" 
## PaperlessBilling    PaymentMethod   MonthlyCharges     TotalCharges 
##         "factor"         "factor"        "numeric"        "numeric" 
##            Churn 
##         "factor"

Step 2: Data wrangling and transformation

# check the columns with NA values
sapply(dt, function(x) sum(is.na(x)))
##       customerID           gender    SeniorCitizen          Partner 
##                0                0                0                0 
##       Dependents           tenure     PhoneService    MultipleLines 
##                0                0                0                0 
##  InternetService   OnlineSecurity     OnlineBackup DeviceProtection 
##                0                0                0                0 
##      TechSupport      StreamingTV  StreamingMovies         Contract 
##                0                0                0                0 
## PaperlessBilling    PaymentMethod   MonthlyCharges     TotalCharges 
##                0                0                0               11 
##            Churn 
##                0
# remove the rows with NA values
dt <- dt[complete.cases(dt) == TRUE, ]


# check the unique calues for each column

for (col in names(dt)){
    if (class(dt[[col]]) %in% c("character", "factor") & col != 'customerID'){
          print(unique(dt[col]))  
    }
}
##   gender
## 1 Female
## 2   Male
##   Partner
## 1     Yes
## 2      No
##   Dependents
## 1         No
## 7        Yes
##   PhoneService
## 1           No
## 2          Yes
##      MultipleLines
## 1 No phone service
## 2               No
## 6              Yes
##    InternetService
## 1              DSL
## 5      Fiber optic
## 12              No
##         OnlineSecurity
## 1                   No
## 2                  Yes
## 12 No internet service
##           OnlineBackup
## 1                  Yes
## 2                   No
## 12 No internet service
##       DeviceProtection
## 1                   No
## 2                  Yes
## 12 No internet service
##            TechSupport
## 1                   No
## 4                  Yes
## 12 No internet service
##            StreamingTV
## 1                   No
## 6                  Yes
## 12 No internet service
##        StreamingMovies
## 1                   No
## 6                  Yes
## 12 No internet service
##          Contract
## 1  Month-to-month
## 2        One year
## 12       Two year
##   PaperlessBilling
## 1              Yes
## 2               No
##               PaymentMethod
## 1          Electronic check
## 2              Mailed check
## 4 Bank transfer (automatic)
## 7   Credit card (automatic)
##   Churn
## 1    No
## 3   Yes
#need to change "No internet service" to "No" for "OnlineSecurity", #"OnlineBackup", "DeviceProtection", "TechSupport", "streamingTV", "streamingMovies".

for(i in c(10:15)) {
  print(names(dt[i]))
}
## [1] "OnlineSecurity"
## [1] "OnlineBackup"
## [1] "DeviceProtection"
## [1] "TechSupport"
## [1] "StreamingTV"
## [1] "StreamingMovies"
for(i in c(10:15)) {
  dt[i][dt[i]=="No internet service"] <- "No"
}

# change "No phone service" to "No" for MultipleLines
dt$MultipleLines[dt$MultipleLines == "No phone service"] <- "No"


#remove CustomerID column
dt$customerID <- NULL

#check tenure
min(dt$tenure); max(dt$tenure)
## [1] 1
## [1] 72
#transform tenure to 
dt$tenure <- as.factor(ifelse(dt$tenure == 0, '0',
                   ifelse(dt$tenure <= 24, '1',
                          ifelse(dt$tenure <= 48, '2', '3'
                          ))))

#tranform Churn values from Yes-no to 1-0
dt$Churn <- as.factor(ifelse(dt$Churn == "Yes", 1, 0))

#change SeniorCitizen type to factor
dt$SeniorCitizen <- as.factor(dt$SeniorCitizen)

# transform the needed columns to factor data type

for (col in names(dt)){
    if (class(dt[[col]]) == "character"){
          dt <- dt%>%
          mutate_at(col, funs(factor(.)))
    }
}


#check the data type of each column again
str(dt)
## 'data.frame':    7032 obs. of  20 variables:
##  $ gender          : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 1 2 1 1 2 ...
##  $ SeniorCitizen   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Partner         : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 2 1 ...
##  $ Dependents      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 2 ...
##  $ tenure          : Factor w/ 3 levels "1","2","3": 1 2 1 2 1 1 1 1 2 3 ...
##  $ PhoneService    : Factor w/ 2 levels "No","Yes": 1 2 2 1 2 2 2 1 2 2 ...
##  $ MultipleLines   : Factor w/ 3 levels "No","No phone service",..: 1 1 1 1 1 3 3 1 3 1 ...
##  $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 1 1 1 1 2 2 2 1 2 1 ...
##  $ OnlineSecurity  : Factor w/ 3 levels "No","No internet service",..: 1 3 3 3 1 1 1 3 1 3 ...
##  $ OnlineBackup    : Factor w/ 3 levels "No","No internet service",..: 3 1 3 1 1 1 3 1 1 3 ...
##  $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 1 3 1 3 1 3 1 1 3 1 ...
##  $ TechSupport     : Factor w/ 3 levels "No","No internet service",..: 1 1 1 3 1 1 1 1 3 1 ...
##  $ StreamingTV     : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 3 1 3 1 ...
##  $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 1 1 3 1 ...
##  $ Contract        : Factor w/ 3 levels "Month-to-month",..: 1 2 1 2 1 1 1 1 1 2 ...
##  $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 2 1 2 1 2 2 2 1 2 1 ...
##  $ PaymentMethod   : Factor w/ 4 levels "Bank transfer (automatic)",..: 3 4 4 1 3 3 2 4 3 1 ...
##  $ MonthlyCharges  : num  29.9 57 53.9 42.3 70.7 ...
##  $ TotalCharges    : num  29.9 1889.5 108.2 1840.8 151.7 ...
##  $ Churn           : Factor w/ 2 levels "0","1": 1 1 2 1 2 2 1 1 2 1 ...

Step 3: Preparing test and training data

#Check the numeric variables and correlation between them
cols <- sapply(dt, is.numeric)


cor(dt%>%select(MonthlyCharges, TotalCharges))
##                MonthlyCharges TotalCharges
## MonthlyCharges      1.0000000    0.6510648
## TotalCharges        0.6510648    1.0000000
dt$TotalCharges <- NULL

#generate random sample 
set.seed(999) 
sample <- sample.int(n = nrow(dt), size = floor(0.7*nrow(dt)), replace = F)

# split data to train and test datasets
train <- dt[sample, ]
test  <- dt[-sample, ]

dim(train); dim(test)
## [1] 4922   19
## [1] 2110   19

First Model: General linear model

#create logistic regression.
LogModel <- glm(Churn ~ .,family=binomial(link="logit"),data=train)
print(summary(LogModel))
## 
## Call:
## glm(formula = Churn ~ ., family = binomial(link = "logit"), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8526  -0.6763  -0.3117   0.7428   3.0829  
## 
## Coefficients:
##                                       Estimate Std. Error z value Pr(>|z|)
## (Intercept)                           0.057963   0.950965   0.061 0.951398
## genderMale                            0.002734   0.076868   0.036 0.971628
## SeniorCitizen1                        0.268926   0.099729   2.697 0.007006
## PartnerYes                           -0.097740   0.091170  -1.072 0.283691
## DependentsYes                        -0.062328   0.106214  -0.587 0.557324
## tenure2                              -0.834858   0.109062  -7.655 1.93e-14
## tenure3                              -1.052650   0.141704  -7.429 1.10e-13
## PhoneServiceYes                       0.101311   0.766154   0.132 0.894800
## MultipleLinesYes                      0.275235   0.208621   1.319 0.187067
## InternetServiceFiber optic            1.496699   0.943214   1.587 0.112557
## InternetServiceNo                    -1.588205   0.952715  -1.667 0.095508
## OnlineSecurityYes                    -0.319207   0.211096  -1.512 0.130498
## OnlineBackupYes                      -0.063789   0.207233  -0.308 0.758225
## DeviceProtectionYes                   0.115565   0.207933   0.556 0.578362
## TechSupportYes                       -0.273304   0.213394  -1.281 0.200281
## StreamingTVYes                        0.418491   0.385555   1.085 0.277733
## StreamingMoviesYes                    0.490949   0.385289   1.274 0.202580
## ContractOne year                     -0.717008   0.123875  -5.788 7.12e-09
## ContractTwo year                     -1.599819   0.200084  -7.996 1.29e-15
## PaperlessBillingYes                   0.348024   0.088038   3.953 7.71e-05
## PaymentMethodCredit card (automatic) -0.026721   0.133284  -0.200 0.841105
## PaymentMethodElectronic check         0.425187   0.111787   3.804 0.000143
## PaymentMethodMailed check             0.041133   0.136285   0.302 0.762794
## MonthlyCharges                       -0.024963   0.037454  -0.666 0.505098
##                                         
## (Intercept)                             
## genderMale                              
## SeniorCitizen1                       ** 
## PartnerYes                              
## DependentsYes                           
## tenure2                              ***
## tenure3                              ***
## PhoneServiceYes                         
## MultipleLinesYes                        
## InternetServiceFiber optic              
## InternetServiceNo                    .  
## OnlineSecurityYes                       
## OnlineBackupYes                         
## DeviceProtectionYes                     
## TechSupportYes                          
## StreamingTVYes                          
## StreamingMoviesYes                      
## ContractOne year                     ***
## ContractTwo year                     ***
## PaperlessBillingYes                  ***
## PaymentMethodCredit card (automatic)    
## PaymentMethodElectronic check        ***
## PaymentMethodMailed check               
## MonthlyCharges                          
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5681.1  on 4921  degrees of freedom
## Residual deviance: 4168.8  on 4898  degrees of freedom
## AIC: 4216.8
## 
## Number of Fisher Scoring iterations: 6
#analyze the deviance table with anova
anova(LogModel, test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Churn
## 
## Terms added sequentially (first to last)
## 
## 
##                  Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                              4921     5681.1              
## gender            1     0.01      4920     5681.1  0.907052    
## SeniorCitizen     1   126.93      4919     5554.2 < 2.2e-16 ***
## Partner           1   101.04      4918     5453.1 < 2.2e-16 ***
## Dependents        1    27.79      4917     5425.3 1.353e-07 ***
## tenure            2   419.20      4915     5006.1 < 2.2e-16 ***
## PhoneService      1     1.30      4914     5004.8  0.254274    
## MultipleLines     1    88.59      4913     4916.2 < 2.2e-16 ***
## InternetService   2   481.20      4911     4435.1 < 2.2e-16 ***
## OnlineSecurity    1    56.60      4910     4378.4 5.335e-14 ***
## OnlineBackup      1     6.18      4909     4372.3  0.012917 *  
## DeviceProtection  1     2.63      4908     4369.6  0.104753    
## TechSupport       1    38.34      4907     4331.3 5.953e-10 ***
## StreamingTV       1     7.77      4906     4323.5  0.005318 ** 
## StreamingMovies   1     7.86      4905     4315.7  0.005066 ** 
## Contract          2   102.76      4903     4212.9 < 2.2e-16 ***
## PaperlessBilling  1    17.18      4902     4195.7 3.407e-05 ***
## PaymentMethod     3    26.52      4899     4169.2 7.409e-06 ***
## MonthlyCharges    1     0.44      4898     4168.8  0.504985    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#create prediction model
predict_glm <- predict(LogModel, newdata = test[, -19], type = 'response')


test$Churn <- as.character(test$Churn)
test$Churn[test$Churn=="No"] <- "0"
test$Churn[test$Churn=="Yes"] <- "1"


#set the predicted value of churn by threshold = 0.5
predict_glm <- ifelse(predict_glm >0.5, 1, 0)

# get the accuracy of the model
misClasificError <- mean(predict_glm != test$Churn)
print(paste('Logistic Regression Accuracy',1-misClasificError))
## [1] "Logistic Regression Accuracy 0.801895734597156"
# Confusion matrix
table(test$Churn, predict_glm)
##    predict_glm
##        0    1
##   0 1386  154
##   1  264  306

Second Model: Decision Tree

#way 1: with library "party"
library("party")
## Warning: package 'party' was built under R version 3.4.4
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.4.4
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.4.4
## 
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
## 
##     boundary
tree <- ctree(Churn~Contract+tenure+PaperlessBilling, train)
plot(tree, type='simple')

# Make prediction on Test dataset
pred_tree <- predict(tree, test)

#Confusion Matrix for Decision Tree
table(Predicted = pred_tree, Actual = test$Churn)
##          Actual
## Predicted    0    1
##         0 1308  267
##         1  232  303
p1 <- predict(tree, train)
tab1 <- table(Predicted = p1, Actual = train$Churn)
tab2 <- table(Predicted = pred_tree, Actual = test$Churn)

#Accuracy of Decision tree model on Train dataset
(tab1[1,1] + tab1[2,2]) / sum(tab1)
## [1] 0.756603
#Accuracy of Decision tree model on Test dataset
(tab2[1,1] + tab2[2,2]) / sum(tab2)
## [1] 0.7635071
# way2: with library "rpart"
library(rpart)
library("rpart.plot")
tree2 <- rpart(data=train, Churn ~ Contract+tenure+PaperlessBilling, method='class')

rpart.plot(tree2)

Third Model: Random Forrest

library("randomForest")
## Warning: package 'randomForest' was built under R version 3.4.4
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
#create model with training data and see the confusion matrix 
rfModel <- randomForest(Churn ~., data = train)
print(rfModel)
## 
## Call:
##  randomForest(formula = Churn ~ ., data = train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 21.68%
## Confusion matrix:
##      0   1 class.error
## 0 3232 391   0.1079216
## 1  676 623   0.5204003
# predict churn on the test dataset with the created model 
pred_rf <- predict(rfModel, test)

# check confusion matrix
table(pred_rf, as.factor(test$Churn))
##        
## pred_rf    0    1
##       0 1395  298
##       1  145  272
# plot model to check error dynamics for the different number of trees
plot(rfModel)

varImpPlot(rfModel, sort=T, n.var = 10, main = 'Top 10 Feature Importance')