Automate loan approval process based on customer data with R programming
Case description
This case study discusses how to make a credit decision (whether or not to sanction a loan) based on customer data, using R.
Data / Variables. This case study uses data from analyticsvidhya.com
| Data | |
| Variable | Description |
| Loan_ID | Unique Loan ID |
| Gender | Male/ Female |
| Married | Applicant married (Y/N) |
| Dependents | Number of dependents |
| Education | Applicant Education (Graduate/ Under Graduate) |
| Self_Employed | Self employed (Y/N) |
| ApplicantIncome | Applicant income |
| CoapplicantIncome | Coapplicant income |
| LoanAmount | Loan amount in thousands |
| Loan_Amount_Term | Term of loan in months |
| Credit_History | credit history meets guidelines |
| Property_Area | Urban/ Semi Urban/ Rural |
| Loan_Status | Loan approved (Y/N) |
Problem – Decide whether or not to approve a loan (Y / N)
Download dataset to practice
Train
Test
Solution with R codes
1. Download and save the above dataset on your desktop.
2. Create a project in R
3. Link the saved dataset and set up the working directory
R codes
Read the data into R
# Read the training and test sets from the working directory.
# Note: since R 4.0.0 read.csv() keeps text columns as character vectors by
# default (stringsAsFactors = FALSE); on older versions they arrive as
# factors. The categorical columns are converted explicitly later on, so the
# script does not depend on that default.
train <- read.csv("train.csv", header = TRUE)
test <- read.csv("test.csv", header = TRUE)

# Print train and test to the console
train
test

# Open the spreadsheet-style data viewer; only useful in an interactive session
if (interactive()) {
  View(train)
  View(test)
}

# Exploring the variables in the dataset
head(train)
head(test)

# Exploring the data types in the data set
str(train)
str(test)
# Exploratory data analysis
# Categorical variables are turned into factors with as.factor();
# integer-valued measures are turned into doubles with as.numeric().
train_factor_cols <- c(
  "Gender", "Married", "Dependents", "Education", "Self_Employed",
  "Credit_History", "Property_Area", "Loan_Status"
)
train_numeric_cols <- c(
  "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"
)

train[train_factor_cols] <- lapply(train[train_factor_cols], as.factor)
train[train_numeric_cols] <- lapply(train[train_numeric_cols], as.numeric)
# Frequency tables for categorical variables
for (col in c("Gender", "Married", "Dependents", "Education",
              "Self_Employed", "Credit_History", "Property_Area")) {
  print(table(train[[col]]))
}

# Approval rate in numbers
table(train$Loan_Status)

# Descriptive statistics for continuous variables
applicant_income <- train$ApplicantIncome
summary(applicant_income)
mean(applicant_income)
sd(applicant_income)

# Distribution plots (calls kept verbatim so the default plot titles stay the same)
hist(train$ApplicantIncome)
boxplot(train$ApplicantIncome)

coapplicant_income <- train$CoapplicantIncome
summary(coapplicant_income)
mean(coapplicant_income)
sd(coapplicant_income)

loan_amount <- train$LoanAmount
summary(loan_amount)
# This returns NA because the column has missing values;
# pass na.rm = TRUE to compute the statistic on the observed values only.
mean(loan_amount)
mean(loan_amount, na.rm = TRUE)
sd(loan_amount, na.rm = TRUE)

loan_term <- train$Loan_Amount_Term
summary(loan_term)
mean(loan_term, na.rm = TRUE)
sd(loan_term, na.rm = TRUE)
# Same type conversions for the test data (it has no Loan_Status column to convert)
test_factor_cols <- c(
  "Gender", "Married", "Dependents", "Education", "Self_Employed",
  "Credit_History", "Property_Area"
)
test_numeric_cols <- c(
  "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"
)

test[test_factor_cols] <- lapply(test[test_factor_cols], as.factor)
test[test_numeric_cols] <- lapply(test[test_numeric_cols], as.numeric)
## Descriptive statistics for test data
# Frequency tables for categorical variables
for (col in c("Married", "Gender", "Dependents", "Education",
              "Self_Employed", "Credit_History", "Property_Area")) {
  print(table(test[[col]]))
}

# Descriptive statistics for continuous variables
# Income columns: computed without na.rm, exactly as for the training data
for (col in c("ApplicantIncome", "CoapplicantIncome")) {
  income <- test[[col]]
  print(summary(income))
  print(mean(income))
  print(sd(income))
}

# Loan columns: na.rm = TRUE so missing entries are ignored
for (col in c("LoanAmount", "Loan_Amount_Term")) {
  loan_values <- test[[col]]
  print(summary(loan_values))
  print(mean(loan_values, na.rm = TRUE))
  print(sd(loan_values, na.rm = TRUE))
}
# Missing values analysis
# Blank cells in the text columns reach R as empty strings ("") rather than
# NA, and R does not treat an empty string as missing. Recode "" to NA in
# every predictor so that is.na() and na.rm = TRUE see those cells.
# (For numeric columns read.csv() already turns blank fields into NA, so the
# recode is a harmless no-op there and the loop simply reports the count.)
train_predictors <- c(
  "Gender", "Married", "Dependents", "Education", "Self_Employed",
  "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term",
  "Credit_History", "Property_Area"
)

for (col in train_predictors) {
  values <- train[[col]]
  # which() ignores comparisons that evaluate to NA (cells already missing)
  values[which(values == "")] <- NA
  if (is.factor(values)) {
    # Remove the now-unused "" level so it no longer shows up in table()
    values <- droplevels(values)
  }
  train[[col]] <- values

  # Frequency of missing values: the TRUE count is the number of missing cells
  cat(col, "\n", sep = "")
  print(table(is.na(values)))
}
## Missing values for test data
# Blank cells in the text columns reach R as empty strings ("") rather than
# NA, and R does not treat an empty string as missing. Recode "" to NA in
# every predictor so that is.na() and na.rm = TRUE see those cells.
# (For numeric columns read.csv() already turns blank fields into NA, so the
# recode is a harmless no-op there and the loop simply reports the count.)
test_predictors <- c(
  "Gender", "Married", "Dependents", "Education", "Self_Employed",
  "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term",
  "Credit_History", "Property_Area"
)

for (col in test_predictors) {
  values <- test[[col]]
  # which() ignores comparisons that evaluate to NA (cells already missing)
  values[which(values == "")] <- NA
  if (is.factor(values)) {
    # Remove the now-unused "" level so it no longer shows up in table()
    values <- droplevels(values)
  }
  test[[col]] <- values

  # Frequency of missing values: the TRUE count is the number of missing cells
  cat(col, "\n", sep = "")
  print(table(is.na(values)))
}
## Handling missing values
# Option 1: listwise deletion - keep only the rows that contain no NA at all.
# The reduced copies are stored separately; train and test stay untouched.
train1 <- na.omit(train)
test1 <- na.omit(test)

# Inspect how many complete observations remain in each set
str(train1)
str(test1)

# Because the dataset has few observations, the rows with missing values
# are not dropped from the analysis.
## Missing value imputation
# From the analysis above, the following variables have missing values:
# Gender, Married, Dependents, Self_Employed, LoanAmount, Loan_Amount_Term,
# Credit_History

# Categorical variables: fill each gap with a fixed category
# (presumably the most frequent one - confirm against the frequency tables).
train$Gender[is.na(train$Gender)] <- "Male"
train$Married[is.na(train$Married)] <- "Yes"
train$Dependents[is.na(train$Dependents)] <- "0"
train$Self_Employed[is.na(train$Self_Employed)] <- "No"
train$Credit_History[is.na(train$Credit_History)] <- "1"

# Continuous variables: fill each gap with the mean of the observed values
train$LoanAmount[is.na(train$LoanAmount)] <-
  mean(train$LoanAmount, na.rm = TRUE)
train$Loan_Amount_Term[is.na(train$Loan_Amount_Term)] <-
  mean(train$Loan_Amount_Term, na.rm = TRUE)
# For test data
# Categorical variables: same fill values as used for the training data.
# Married is included for consistency with the training data; the assignment
# changes nothing when the column has no missing entries.
test$Gender[is.na(test$Gender)] <- "Male"
test$Married[is.na(test$Married)] <- "Yes"
test$Dependents[is.na(test$Dependents)] <- "0"
test$Self_Employed[is.na(test$Self_Employed)] <- "No"
test$Credit_History[is.na(test$Credit_History)] <- "1"

# Continuous variables: fill each gap with the mean of the observed test values
test$LoanAmount[is.na(test$LoanAmount)] <-
  mean(test$LoanAmount, na.rm = TRUE)
test$Loan_Amount_Term[is.na(test$Loan_Amount_Term)] <-
  mean(test$Loan_Amount_Term, na.rm = TRUE)
# Approval rate as a proportion of all applications
loan_status_counts <- table(train$Loan_Status)
prop.table(loan_status_counts)

## Cross tabulation
# Gender against loan status, with the chi-square test reported by CrossTable()
library(gmodels)
CrossTable(train$Gender, train$Loan_Status, chisq = TRUE)

# The same association tested with base R
chisq.test(train$Gender, train$Loan_Status)