Predicting survivors on the Titanic using machine learning
Case description
Problem – Predicting survivors on the Titanic using machine learning.
This case study is part of the Kaggle competition. You can make the submission of your prediction on Kaggle.
This data is also available at https://www.kaggle.com/c/titanic/data
Data / Variables description
Variable | Description | Category |
survival | Survival | 0 = No, 1 = Yes |
pclass | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd |
sex | Sex | |
Age | Age in years | |
sibsp | # of siblings / spouses aboard the Titanic | |
parch | # of parents / children aboard the Titanic | |
ticket | Ticket number | |
fare | Passenger fare | |
cabin | Cabin number | |
embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton |
Download dataset to practice
Solution with R codes
1. Download and save the above dataset (Train and Test) on your desktop.
2. Create a project in RStudio
If you are new to RStudio, learn how to create a project in RStudio:
https://datasciencevidhya.com/post/how-to-create-a-project-in-r-studio
3. Link the saved dataset and set up the working directory
R codes
Data Exploration
# Load the raw training and test sets from the working directory
train <- read.csv(file = "train.csv", header = TRUE)
test <- read.csv(file = "test.csv", header = TRUE)

# Dump both data frames to the console
print(train)
print(test)

# Open each data set in the data viewer
View(train)
View(test)

# Show the leading rows of each data set
head(train)
head(test)

# Inspect the structure (columns and their data types) of each data set
str(train)
str(test)
Missing value analysis
is.na(train$Pclass) # this command returns TRUE if there is a missing value, otherwise FALSE
# Walk through a single column first: tabulate the TRUE/FALSE flags that
# is.na() returns for Pclass.
table(is.na(train$Pclass))
# The flags can also be stored in a data frame and inspected there.
missing_Pclass <- data.frame(is.na(train$Pclass))
View(missing_Pclass)
table(missing_Pclass) # no missing values

# Flag the missing entries of one column.
#
# A value counts as missing when it is NA or an empty string. read.csv()
# leaves blank text fields as "" rather than NA, so is.na() on its own
# would report such columns as complete.
#
# x: a vector (one column of a data frame).
# Returns a logical vector of the same length as x, TRUE where missing.
is_missing <- function(x) {
  # `!is.na(x) &` keeps the comparison from yielding NA for NA entries
  blank <- !is.na(x) & as.character(x) == ""
  is.na(x) | blank
}

# Apply the check to every column at once instead of repeating it per
# variable: one logical column of flags per original column.
missing_train <- as.data.frame(lapply(train, is_missing))
missing_test <- as.data.frame(lapply(test, is_missing))

# Number of missing values per variable (0 means the column is complete)
colSums(missing_train)
colSums(missing_test)

# Frequency of each Embarked value; blank entries show up as their own row
table(train$Embarked)
Exploratory data analysis
Frequency tables and descriptive statistics
# Categorical columns to convert to factors. Survived is converted for the
# training data only.
train_categorical <- c(
  "Pclass", "Sex", "SibSp", "Parch", "Embarked", "Survived"
)
test_categorical <- c("Pclass", "Sex", "SibSp", "Parch", "Embarked")

# Training data: convert each column, then print its frequency table
for (column in train_categorical) {
  train[[column]] <- as.factor(train[[column]])
  print(table(train[[column]]))
}

# Test data: same conversion and frequency tables
for (column in test_categorical) {
  test[[column]] <- as.factor(test[[column]])
  print(table(test[[column]]))
}
# Descriptive statistics for the continuous variable Age (training data);
# na.rm = TRUE skips the missing ages
summary(train$Age)
sd(train$Age, na.rm = TRUE)
mean(train$Age, na.rm = TRUE)

# Descriptive statistics for the continuous variable Age (test data)
summary(test$Age)
sd(test$Age, na.rm = TRUE)
mean(test$Age, na.rm = TRUE)

# Mean imputation: fill each missing training age with the training mean
train$Age <- replace(
  train$Age,
  is.na(train$Age),
  mean(train$Age, na.rm = TRUE)
)

# Mean imputation: fill each missing test age with the test mean
test$Age <- replace(
  test$Age,
  is.na(test$Age),
  mean(test$Age, na.rm = TRUE)
)
## Add a Survived column to the test data for prediction
# The column is filled with NA and declared as a factor with the levels
# "0" (did not survive) and "1" (survived), the same coding the training
# data uses.
#
# Do not assign to levels(train$Survived) here: `levels<-` renames the
# existing levels by position, so writing c("1", "0") over c("0", "1")
# would swap the labels of survivors and non-survivors in the training
# data and invert every table and class prediction derived from it.
test$Survived <- factor(rep(NA, nrow(test)), levels = c("0", "1"))
levels(test$Survived)
# Survival counts in absolute numbers
table(train$Survived)
# As proportions
prop.table(table(train$Survived))
Bivariate relationships
# Two-way comparison: Sex and Survived (counts)
table(train$Sex, train$Survived)
# Two-way comparison: row-wise proportions.
# margin = 1 divides each cell by its row total, so every Sex row sums to 1
# and shows the share of each outcome within that sex. Without margin,
# prop.table() divides by the grand total instead.
prop.table(table(train$Sex, train$Survived), margin = 1)
Make predictions using a zero-rule (ZeroR) baseline classification model
Either everyone survives or everyone dies
# Prediction 1 - everyone dies: copy the test data and set Survived to 0
everyone_dies <- test
everyone_dies[["Survived"]] <- 0
View(everyone_dies)
# Build the submission (PassengerId plus the Survived prediction) and save
# it as a csv file to submit to Kaggle
submit <- with(
  everyone_dies,
  data.frame(PassengerId = PassengerId, Survived = Survived)
)
write.csv(submit, file = "everyone_dies.csv", row.names = FALSE)

# Prediction 2 - everyone survives: copy the test data and set Survived to 1
everyonesurvives <- test
everyonesurvives[["Survived"]] <- 1
View(everyonesurvives)
# Build the submission (PassengerId plus the Survived prediction) and save
# it as a csv file to submit to Kaggle
submit <- with(
  everyonesurvives,
  data.frame(PassengerId = PassengerId, Survived = Survived)
)
write.csv(submit, file = "everyone_survives.csv", row.names = FALSE)
# Prediction 3 - All females survived / Was the "women and children first"
# rule followed?
# Cross-tabulate survival against sex to see how the two are related.
# gmodels is installed only when it is missing, so re-running the script
# does not reinstall the package every time.
if (!requireNamespace("gmodels", quietly = TRUE)) {
  install.packages("gmodels")
}
library(gmodels)
CrossTable(train$Survived, train$Sex)
Allfemalesurvived <- test
# Initialize the Survived column to 0 for every passenger
Allfemalesurvived$Survived <- 0
# Set Survived to 1 where Sex equals "female"
Allfemalesurvived$Survived[Allfemalesurvived$Sex == "female"] <- 1
# Export the predictions (PassengerId and Survived) to a .csv file
submit <- data.frame(
  PassengerId = Allfemalesurvived$PassengerId,
  Survived = Allfemalesurvived$Survived
)
write.csv(submit, file = "femalessurvived.csv", row.names = FALSE)
Supervised Machine learning Algorithms for predictive modeling.
Logistic regression
Train the model using the training set and check the score
# Fit a logistic regression of Survived on ticket class, sex, port of
# embarkation and age, using the training data.
logistic <- glm(
  formula = Survived ~ Pclass + Sex + Embarked + Age,
  data = train,
  family = "binomial"
)
summary(logistic)
# Predict on the test data; type = "response" returns fitted probabilities
# rather than values on the log-odds scale
my_log_prediction <- predict(logistic, test, type = "response")
# Output 1: the predicted probability for each passenger
submit <- data.frame(
  PassengerId = test$PassengerId,
  Survived = my_log_prediction
)
write.csv(submit, file = "logistic_prob.csv", row.names = FALSE)
# Output 2: a class label for each passenger, "1" when the predicted
# probability exceeds 0.5 and "0" otherwise. The probabilities live in
# my_log_prediction; no object named `predicted` is defined in this script.
predicted.classes <- ifelse(my_log_prediction > 0.5, "1", "0")
table(predicted.classes)
submit <- data.frame(
  PassengerId = test$PassengerId,
  Survived = predicted.classes
)
write.csv(submit, file = "logistic_class.csv", row.names = FALSE)
# Output 3: probability and class label side by side
submit <- data.frame(
  PassengerId = test$PassengerId,
  Survived = my_log_prediction,
  Classification = predicted.classes
)
write.csv(submit, file = "logistic_class_prob.csv", row.names = FALSE)
Decision Trees
CART decision tree algorithm
library(rpart)
# Install the plotting helpers only when they are missing, so re-running
# the script does not reinstall them every time.
for (pkg in c("rattle", "rpart.plot", "RColorBrewer")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(rattle)
library(rpart.plot)
# Fit a classification tree (method = "class") on the training data
fit <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Embarked,
  data = train,
  method = "class"
)
# fit is our decision tree model
# Visualize the tree: base graphics first, then the rattle version
plot(fit)
text(fit)
fancyRpartPlot(fit)
# Predict on the test data. type = "class" returns the predicted level
# labels of train$Survived, so those labels must be coded 0 = did not
# survive, 1 = survived at this point for the submission to be correct.
Prediction <- predict(fit, test, type = "class")
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
write.csv(submit, file = "myfirstdtree.csv", row.names = FALSE)