---
title: "Business Data Analytics. Practice Session"
subtitle: "Customer Lifecycle Management: Churn Prediction"
output:
  prettydoc::html_pretty:
    theme: cayman
    highlight: github
  html_document: default
  html_notebook: default
  github_document: default
---

## Step 1: Data loading

```{r}
library('tidyverse')
library('data.table')

# Load the dataset
dt <- read.csv(file.choose())
dt <- as.data.frame(dt)

# check the dataset
str(dt)

# check the data type of each column
sapply(dt, class)
```

## Step 2: Data wrangling and transformation

```{r}
# check the columns with NA values
sapply(dt, function(x) sum(is.na(x)))

# remove the rows with NA values
dt <- dt[complete.cases(dt), ]

# check the unique values of each categorical column (except customerID)
for (col in names(dt)) {
  if (class(dt[[col]]) %in% c("character", "factor") & col != 'customerID') {
    print(unique(dt[[col]]))
  }
}

# change "No internet service" to "No" for "OnlineSecurity", "OnlineBackup",
# "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies" (columns 10-15)
for (i in c(10:15)) {
  print(names(dt[i]))
}
for (i in c(10:15)) {
  dt[i][dt[i] == "No internet service"] <- "No"
}

# change "No phone service" to "No" for MultipleLines
dt$MultipleLines[dt$MultipleLines == "No phone service"] <- "No"

# remove the customerID column
dt$customerID <- NULL

# check the range of tenure
min(dt$tenure); max(dt$tenure)

# transform tenure into tenure groups: 0, 1-24, 25-48 and 49+ months
dt$tenure <- as.factor(ifelse(dt$tenure == 0, '0',
                       ifelse(dt$tenure <= 24, '1',
                       ifelse(dt$tenure <= 48, '2', '3'))))

# transform Churn values from Yes/No to 1/0
dt$Churn <- as.factor(ifelse(dt$Churn == "Yes", 1, 0))

# change SeniorCitizen type to factor
dt$SeniorCitizen <- as.factor(dt$SeniorCitizen)

# transform the remaining character columns to the factor data type
for (col in names(dt)) {
  if (class(dt[[col]]) == "character") {
    dt <- dt %>% mutate_at(col, ~ factor(.))
  }
}

# check the data type of each column again
str(dt)
```

## Step 3: Preparing test and training data

```{r}
# check the numeric variables and the correlation between them
cols <- sapply(dt, is.numeric)
cor(dt %>% select(MonthlyCharges, TotalCharges))

# remove TotalCharges because of its correlation with MonthlyCharges
dt$TotalCharges <- NULL

# generate a random sample of row indices
set.seed(999)
sample <- sample.int(n = nrow(dt), size = floor(0.7 * nrow(dt)), replace = FALSE)

# split the data into training and test datasets
train <- dt[sample, ]
test <- dt[-sample, ]
dim(train); dim(test)
```

## First Model: Generalized Linear Model (Logistic Regression)

```{r}
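# before fitting, a quick optional check (not essential to the workflow):
# the class balance of the response in the training set; the majority-class
# share is a simple baseline accuracy to compare the models against
prop.table(table(train$Churn))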
# create the logistic regression model
LogModel <- glm(Churn ~ ., family = binomial(link = "logit"), data = train)
print(summary(LogModel))

# analyze the deviance table with anova
anova(LogModel, test = "Chisq")

# predict churn probabilities on the test dataset (column 19 is the Churn outcome)
predict_glm <- predict(LogModel, newdata = test[, -19], type = 'response')

# make sure the observed labels are coded as "0"/"1"
# (Churn was already recoded to 1/0 in Step 2, so this is only a safeguard)
test$Churn <- as.character(test$Churn)
test$Churn[test$Churn == "No"] <- "0"
test$Churn[test$Churn == "Yes"] <- "1"

# set the predicted value of churn by threshold = 0.5
predict_glm <- ifelse(predict_glm > 0.5, 1, 0)

# get the accuracy of the model
misClasificError <- mean(predict_glm != test$Churn)
print(paste('Logistic Regression Accuracy', 1 - misClasificError))

# confusion matrix
table(test$Churn, predict_glm)
```

## Second Model: Decision Tree

```{r}
# way 1: with library "party"
library("party")
tree <- ctree(Churn ~ Contract + tenure + PaperlessBilling, train)
plot(tree, type = 'simple')

# make predictions on the test dataset
pred_tree <- predict(tree, test)

# confusion matrix for the decision tree
table(Predicted = pred_tree, Actual = test$Churn)

p1 <- predict(tree, train)
tab1 <- table(Predicted = p1, Actual = train$Churn)
tab2 <- table(Predicted = pred_tree, Actual = test$Churn)

# accuracy of the decision tree model on the train dataset
(tab1[1, 1] + tab1[2, 2]) / sum(tab1)

# accuracy of the decision tree model on the test dataset
(tab2[1, 1] + tab2[2, 2]) / sum(tab2)

# way 2: with library "rpart"
library(rpart)
library("rpart.plot")
tree2 <- rpart(data = train, Churn ~ Contract + tenure + PaperlessBilling, method = 'class')
rpart.plot(tree2)
```

## Third Model: Random Forest

```{r}
library("randomForest")

# create the model with the training data and look at the confusion matrix
rfModel <- randomForest(Churn ~ ., data = train)
print(rfModel)

# predict churn on the test dataset with the created model
pred_rf <- predict(rfModel, test)

# check the confusion matrix
table(pred_rf, as.factor(test$Churn))

# plot the model to check the error dynamics for different numbers of trees
plot(rfModel)

# top 10 most important features
varImpPlot(rfModel, sort = TRUE, n.var = 10, main = 'Top 10 Feature Importance')
```
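
For comparison with the logistic regression and decision tree results above, a minimal follow-up sketch (using the `pred_rf` predictions already computed in the previous chunk) derives the random forest's test-set accuracy from its confusion matrix:

```{r}
# accuracy of the random forest model on the test dataset,
# computed as the sum of the confusion-matrix diagonal over the total
tab_rf <- table(Predicted = pred_rf, Actual = test$Churn)
sum(diag(tab_rf)) / sum(tab_rf)
```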