Automate loan approval process based on customer data with R programming
Case description
This case study shows how to decide whether or not to approve (sanction) a loan, based on the customer's data, using R programming.
Data / Variables. This case study uses data from analyticsvidhya.com
Data | |
Variable | Description |
Loan_ID | Unique Loan ID |
Gender | Male/ Female |
Married | Applicant married (Y/N) |
Dependents | Number of dependents |
Education | Applicant Education (Graduate/ Under Graduate) |
Self_Employed | Self employed (Y/N) |
ApplicantIncome | Applicant income |
CoapplicantIncome | Coapplicant income |
LoanAmount | Loan amount in thousands |
Loan_Amount_Term | Term of loan in months |
Credit_History | credit history meets guidelines |
Property_Area | Urban/ Semi Urban/ Rural |
Loan_Status | Loan approved (Y/N) |
Problem – Whether to approve loan or not (Y / N)
Download dataset to practice
Train
Test
Solution with R codes
1. Download and save the above dataset on your desktop.
2. Create a project in R
3. Link the saved dataset and set up the working directory
R codes
Read the data into R
# Read the data into R ----
# Since R 4.0.0 read.csv() keeps text columns as character vectors
# (stringsAsFactors = FALSE is the default); the categorical columns are
# converted to factors explicitly later in this script.
train <- read.csv("train.csv", header = TRUE)
test <- read.csv("test.csv", header = TRUE)

# Print train and test to the console
train
test

# Open the spreadsheet-style viewer only in an interactive session: View()
# needs a GUI and may fail when the script is run in batch mode (e.g. Rscript).
if (interactive()) {
  View(train)
  View(test)
}

# Exploring the variables in the dataset (first rows of each data frame)
head(train)
head(test)

# Exploring the data types in the data set
str(train)
str(test)
# Exploratory data analysis ----
# Categorical variables become factors (as.factor) and the income / loan
# columns become plain numerics (as.numeric). The conversion is done list-wise
# inside local() so that the helper vectors do not linger in the workspace.
train <- local({
  categorical <- c(
    "Gender", "Married", "Dependents", "Education", "Self_Employed",
    "Credit_History", "Property_Area", "Loan_Status"
  )
  continuous <- c(
    "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"
  )
  train[categorical] <- lapply(train[categorical], as.factor)
  train[continuous] <- lapply(train[continuous], as.numeric)
  train
})
# Frequency tables for the categorical variables, one table per column
invisible(lapply(
  c(
    "Gender", "Married", "Dependents", "Education",
    "Self_Employed", "Credit_History", "Property_Area"
  ),
  function(column) print(table(train[[column]]))
))

# Approval rate in numbers (count of each Loan_Status value)
table(train$Loan_Status)

# Descriptive statistics for the continuous variables
invisible(local({
  # Prints summary, mean and standard deviation of one numeric vector.
  # Extra arguments (e.g. na.rm = TRUE) are forwarded to mean() and sd().
  describe <- function(values, ...) {
    print(summary(values))
    print(mean(values, ...))
    print(sd(values, ...))
  }

  describe(train$ApplicantIncome)
  # The plotting calls keep the full `train$...` expression because hist()
  # builds its title and axis label from the argument expression.
  hist(train$ApplicantIncome)
  boxplot(train$ApplicantIncome)

  describe(train$CoapplicantIncome)

  print(summary(train$LoanAmount))
  # Shown on purpose: without na.rm = TRUE the mean comes back as NA
  print(mean(train$LoanAmount))
  print(mean(train$LoanAmount, na.rm = TRUE))
  print(sd(train$LoanAmount, na.rm = TRUE))

  describe(train$Loan_Amount_Term, na.rm = TRUE)
}))
# Same type conversion for the test data: categorical columns to factors,
# income / loan columns to numerics. (The test set has no Loan_Status column
# to convert.)
test <- local({
  categorical <- c(
    "Gender", "Married", "Dependents", "Education", "Self_Employed",
    "Credit_History", "Property_Area"
  )
  continuous <- c(
    "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"
  )
  test[categorical] <- lapply(test[categorical], as.factor)
  test[continuous] <- lapply(test[continuous], as.numeric)
  test
})
## Descriptive statistics for test data
# Frequency tables for the categorical variables, one table per column
invisible(lapply(
  c(
    "Married", "Gender", "Dependents", "Education",
    "Self_Employed", "Credit_History", "Property_Area"
  ),
  function(column) print(table(test[[column]]))
))

# Descriptive statistics for the continuous variables
invisible(local({
  # Prints summary, mean and standard deviation of one numeric vector.
  # Extra arguments (e.g. na.rm = TRUE) are forwarded to mean() and sd().
  describe <- function(values, ...) {
    print(summary(values))
    print(mean(values, ...))
    print(sd(values, ...))
  }

  describe(test$ApplicantIncome)
  describe(test$CoapplicantIncome)
  # These two columns are summarised with na.rm = TRUE so that missing
  # entries do not turn the mean / sd into NA.
  describe(test$LoanAmount, na.rm = TRUE)
  describe(test$Loan_Amount_Term, na.rm = TRUE)
}))
# Missing values analysis (train) ----
# In the text columns a blank cell is stored as an empty string (""), which R
# does not treat as missing, so "" is recoded to NA explicitly.
# Two details matter here:
#   * Numeric columns can never equal "", so they are only counted, not
#     recoded.
#   * After recoding a factor, the now-unused "" level is dropped; otherwise it
#     would keep appearing as a zero-count category in later tables and
#     cross-tabulations.
for (col in c(
  "Gender", "Married", "Dependents", "Education", "Self_Employed",
  "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term",
  "Credit_History", "Property_Area"
)) {
  values <- train[[col]]
  if (!is.numeric(values)) {
    # Guard with !is.na() so existing NAs never end up in the index
    values[!is.na(values) & values == ""] <- NA
    if (is.factor(values)) {
      values <- droplevels(values)
    }
    train[[col]] <- values
  }
  # Frequency of missing values: TRUE is the number of missing entries
  print(table(is.na(train[[col]])))
}
## Missing values for test data ----
# In the text columns a blank cell is stored as an empty string (""), which R
# does not treat as missing, so "" is recoded to NA explicitly.
# Two details matter here:
#   * Numeric columns can never equal "", so they are only counted, not
#     recoded.
#   * After recoding a factor, the now-unused "" level is dropped; otherwise it
#     would keep appearing as a zero-count category in later tables.
for (col in c(
  "Gender", "Married", "Dependents", "Education", "Self_Employed",
  "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term",
  "Credit_History", "Property_Area"
)) {
  values <- test[[col]]
  if (!is.numeric(values)) {
    # Guard with !is.na() so existing NAs never end up in the index
    values[!is.na(values) & values == ""] <- NA
    if (is.factor(values)) {
      values <- droplevels(values)
    }
    test[[col]] <- values
  }
  # Frequency of missing values: TRUE is the number of missing entries
  print(table(is.na(test[[col]])))
}
## Handling missing values ----
# The first option is to delete / drop every row that has a missing value.
# train1 / test1 hold the complete cases; str() shows how many rows remain.
train1 <- na.omit(train)
str(train1)
test1 <- na.omit(test)
str(test1)
# Since we have few observations, we will not drop the rows with missing
# values; we impute them instead.

## Missing value imputation ----
# From the above analysis, the following variables have missing values:
# Gender, Married, Dependents, Self_Employed, LoanAmount, Loan_Amount_Term,
# Credit_History.
# Categorical variables are filled with a fixed category (presumably the most
# frequent one -- verify against the frequency tables); numeric variables are
# filled with the column mean.
train$Gender[is.na(train$Gender)] <- "Male"
train$Married[is.na(train$Married)] <- "Yes"
train$Dependents[is.na(train$Dependents)] <- "0"
# Self_Employed is indexed by its OWN missing values. An earlier version also
# indexed it by is.na(Dependents) and assigned "0", which is not a valid
# Self_Employed category.
train$Self_Employed[is.na(train$Self_Employed)] <- "No"
train$Credit_History[is.na(train$Credit_History)] <- "1"
train$LoanAmount[is.na(train$LoanAmount)] <-
  mean(train$LoanAmount, na.rm = TRUE)
train$Loan_Amount_Term[is.na(train$Loan_Amount_Term)] <-
  mean(train$Loan_Amount_Term, na.rm = TRUE)

# For test data: same rules as for the training data
test$Gender[is.na(test$Gender)] <- "Male"
# Kept for parity with the training set; a no-op when Married has no NAs
test$Married[is.na(test$Married)] <- "Yes"
test$Dependents[is.na(test$Dependents)] <- "0"
test$Self_Employed[is.na(test$Self_Employed)] <- "No"
test$Credit_History[is.na(test$Credit_History)] <- "1"
test$LoanAmount[is.na(test$LoanAmount)] <-
  mean(test$LoanAmount, na.rm = TRUE)
test$Loan_Amount_Term[is.na(test$Loan_Amount_Term)] <-
  mean(test$Loan_Amount_Term, na.rm = TRUE)
# Approval rate as a proportion: share of each Loan_Status value in the
# training data (the counts from table() divided by their total).
prop.table(table(train$Loan_Status))
## Cross tabulation
# gmodels provides CrossTable().
# NOTE(review): library() calls conventionally sit at the top of a script.
library(gmodels)
# Gender by Loan_Status contingency table; chisq = TRUE additionally requests
# a chi-square test of independence in the CrossTable output.
CrossTable(train$Gender, train$Loan_Status, chisq = TRUE)
# The same Gender vs. Loan_Status association tested directly with base R
chisq.test(train$Gender, train$Loan_Status)