Predictive model for Bank customer churn using machine learning
This case study is about predicting customer churn in banks. Please download the dataset given below.
The dataset contains the following variables.
RowNumber
CustomerId
Surname
CreditScore
Geography
Gender
Age
Tenure
Balance
NumOfProducts – How many accounts and bank-affiliated products the person has
HasCrCard
IsActiveMember
EstimatedSalary
Exited – Did they leave the bank after all?
Import and read the data in R
Solution
## Customer churn model
## Import data
# NOTE(review): the path below is absolute and machine-specific; it has to be
# edited before the script can run anywhere else.
Churn_Modelling <- read.csv(
  file = "D:/ATEEQUE_NIBM/0.0 NIBM/2.0 TEACHING/1.0 MBA/2.0 Marketing research and analytics/4.1 Workshop on customer analytics in Banking/6.0 Hands on exercise_customer churn/2.0 Working data/Churn_Modelling.csv"
)
## View data
View(Churn_Modelling)
## Keep the raw import untouched and work on a copy called `dataset`
dataset <- Churn_Modelling
## Check the structure
str(dataset)
## Convert to factors
# Every column listed here is replaced by its factor version in a single pass.
factor_columns <- c(
  "RowNumber", "CustomerId", "Surname", "Geography",
  "Tenure", "HasCrCard", "IsActiveMember", "Exited"
)
dataset[factor_columns] <- lapply(dataset[factor_columns], as.factor)
## Exploratory data analysis
# Columns are grouped by type so that each summary is applied uniformly.
# Results are printed explicitly so they also show up when the script is run
# through source(), where top-level values are not auto-printed.
categorical_columns <- c(
  "Geography", "Gender", "HasCrCard", "IsActiveMember", "Tenure", "Exited"
)
numeric_columns <- c(
  "CreditScore", "Age", "Balance", "NumOfProducts", "EstimatedSalary"
)

## Frequency analysis
for (column_name in categorical_columns) {
  cat("\nFrequency of", column_name, "\n")
  print(table(dataset[[column_name]]))
}

## Descriptive analytics
for (column_name in numeric_columns) {
  cat("\nSummary of", column_name, "\n")
  print(summary(dataset[[column_name]]))
}
for (column_name in numeric_columns) {
  # na.rm = TRUE: a single missing value would otherwise turn the result to NA.
  cat(
    "Standard deviation of", column_name, ":",
    sd(dataset[[column_name]], na.rm = TRUE), "\n"
  )
}

# Each plot is labelled with its column name so the five boxplots and five
# histograms can be told apart.
for (column_name in numeric_columns) {
  boxplot(
    dataset[[column_name]],
    main = paste("Boxplot of", column_name),
    ylab = column_name
  )
}
for (column_name in numeric_columns) {
  hist(
    dataset[[column_name]],
    main = paste("Histogram of", column_name),
    xlab = column_name
  )
}
## Missing data analysis
# Tabulate missing (TRUE) against present (FALSE) values for one column of
# `dataset`, looked up by name.
count_missing <- function(column_name) {
  table(is.na(dataset[[column_name]]))
}
count_missing("Geography")
count_missing("HasCrCard")
count_missing("IsActiveMember")
count_missing("Exited")
count_missing("CreditScore")
count_missing("Age")
count_missing("Tenure")
count_missing("Balance")
count_missing("NumOfProducts")
count_missing("EstimatedSalary")
## Create training and validation (test) sets from the given dataset
# Make sure the outcome column is a factor before partitioning.
dataset[["Exited"]] <- as.factor(dataset[["Exited"]])

# Partition data: every row is labelled 1 (training) or 2 (test), drawn with
# probabilities 0.7 and 0.3. The seed is fixed so the split is reproducible.
set.seed(90)
pd <- sample(
  x = 2,
  size = nrow(dataset),
  replace = TRUE,
  prob = c(0.7, 0.3)
)

# Both levels are declared up front so each partition is always a data frame
# with all columns, even if a label happens not to be drawn.
partitions <- split(dataset, factor(pd, levels = 1:2))
train <- partitions[["1"]]
test <- partitions[["2"]]

# Inspect both partitions
View(train)
View(test)
str(train)
str(test)

# Class counts per partition
train_counts <- table(train[["Exited"]])
test_counts <- table(test[["Exited"]])
train_counts
test_counts

# Class proportions per partition: similar shares in both sets indicate that
# the random split kept the class balance.
prop.table(train_counts)
prop.table(test_counts)
## Logistic regression
# Fit a logistic regression for churn (Exited) on the training set, score the
# test set, write the scored customers to CSV, and report test accuracy.

# Identifier columns are left out of the model. As factors they would add one
# dummy variable per row/customer, and predict() would stop on the levels that
# appear only in the test set.
id_columns <- c("RowNumber", "CustomerId", "Surname")
predictor_columns <- setdiff(names(train), c(id_columns, "Exited"))
churn_formula <- reformulate(predictor_columns, response = "Exited")

# Train the model using the training set
logistic_churn <- glm(
  formula = churn_formula,
  data = train,
  family = "binomial"
)
summary(logistic_churn)

# Predicted probabilities for every customer in the test set
my_log_prediction <- predict(logistic_churn, newdata = test, type = "response")
View(my_log_prediction)
submit <- data.frame(
  CustomerID = test$CustomerId,
  y_actual = test$Exited,
  y_prob = my_log_prediction
)
write.csv(submit, file = "tmc_logistic.csv", row.names = FALSE)

# Now we wish to classify based on probability: a probability above the
# threshold is labelled "1", anything else "0".
classification_threshold <- 0.5
predicted.classes <- ifelse(
  my_log_prediction > classification_threshold, "1", "0"
)
table(predicted.classes)
submit <- data.frame(
  CustomerID = test$CustomerId,
  y_actual = test$Exited,
  y_prob = my_log_prediction,
  y_class = predicted.classes
)
View(submit)
str(submit)
write.csv(submit, file = "tmc_logistic1.csv", row.names = FALSE)

# Misclassification error for the test data sample.
# Both margins are forced onto the same two levels so the table stays 2 x 2
# (and diag() stays meaningful) even when only one class is ever predicted.
# NOTE(review): assumes Exited is coded 0/1, matching the "0"/"1" labels
# used for the predicted classes above - confirm against the data.
class_levels <- c("0", "1")
classification <- table(
  actual = factor(submit$y_actual, levels = class_levels),
  predicted = factor(submit$y_class, levels = class_levels)
)
classification
accuracy_Test <- sum(diag(classification)) / sum(classification)
accuracy_Test
## Misclassification rate
1 - accuracy_Test