Predicting survivors on the Titanic using machine learning

Case description

Problem – Predicting survivors on the Titanic using machine learning.

This case study is part of a Kaggle competition. You can submit your predictions on Kaggle.

This data is also available at https://www.kaggle.com/c/titanic/data

Data / Variables description

Variable	Description	Category
survival	Survival	0 = No, 1 = Yes
pclass	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
sex	Sex
Age	Age in years
sibsp	# of siblings / spouses aboard the Titanic
parch	# of parents / children aboard the Titanic
ticket	Ticket number
fare	Passenger fare
cabin	Cabin number
embarked	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton

Download dataset to practice

Train

Test

Solution with R codes

1. Download and save the above dataset (Train and Test) on your desktop.

2. Create a project in RStudio

If you are new to RStudio, learn how to create a project in RStudio:

https://datasciencevidhya.com/post/how-to-create-a-project-in-r-studio

3. Link the saved dataset and set up the working directory

R codes

Data Exploration

# Load the raw CSV files (expected in the working directory).
train <- read.csv(file = "train.csv", header = TRUE)
test <- read.csv(file = "test.csv", header = TRUE)

# Echo both data frames in full on the console.
train
test

# Open each data frame in the spreadsheet-style data viewer.
View(train)
View(test)

# Show the first six rows of each data set.
head(train, n = 6L)
head(test, n = 6L)

# Inspect the column types and a preview of the values in each column.
str(train)
str(test)

Missing value analysis

# is.na() returns a logical vector: TRUE where Pclass is missing, FALSE otherwise.
is.na(train$Pclass)

# Count the TRUE / FALSE flags directly.
table(is.na(train$Pclass))

# Store the flags in a one-column data frame so they can be viewed and tabulated.
missing_Pclass <- data.frame(is.na(train$Pclass))
View(missing_Pclass)
table(missing_Pclass) # a table showing only FALSE means no missing values

# Flag NA values for the remaining training columns, one data frame per column.
missing_train_Sex <- data.frame(is.na(train$Sex))
missing_train_Age <- data.frame(is.na(train$Age))
missing_train_Sibsp <- data.frame(is.na(train$SibSp))
missing_train_Parch <- data.frame(is.na(train$Parch))
missing_train_Ticket <- data.frame(is.na(train$Ticket))
missing_train_Fare <- data.frame(is.na(train$Fare))
missing_train_Cabin <- data.frame(is.na(train$Cabin))
missing_train_Embarked <- data.frame(is.na(train$Embarked))

# Tabulate the NA flags (TRUE = missing).
table(missing_train_Sex)
table(missing_train_Age)
table(missing_train_Sibsp)
table(missing_train_Parch)
table(missing_train_Ticket)
table(missing_train_Fare)
table(missing_train_Cabin)
table(missing_train_Embarked)

# With the default na.strings = "NA", read.csv() only converts blank fields to
# NA in numeric/logical columns. In character columns (e.g. Cabin, Embarked) a
# blank is read as "" and is.na() reports FALSE for it, so the tables above can
# under-count missing data. Count NA *or* blank per column to catch both.
colSums(is.na(train) | train == "")

# Frequency of each Embarked value; a blank entry shows up as its own category.
table(train$Embarked)

# Print the raw Embarked values for a visual check.
train$Embarked

# Same missing-value check for the test data: one data frame of NA flags per column.
missing_test_Sex <- data.frame(is.na(test$Sex))
missing_test_Age <- data.frame(is.na(test$Age))
missing_test_Sibsp <- data.frame(is.na(test$SibSp))
missing_test_Parch <- data.frame(is.na(test$Parch))
missing_test_Ticket <- data.frame(is.na(test$Ticket))
missing_test_Fare <- data.frame(is.na(test$Fare))
missing_test_Cabin <- data.frame(is.na(test$Cabin))
missing_test_Embarked <- data.frame(is.na(test$Embarked))

# Tabulate the NA flags (TRUE = missing).
table(missing_test_Sex)
table(missing_test_Age)
table(missing_test_Sibsp)
table(missing_test_Parch)
table(missing_test_Ticket)
table(missing_test_Fare)
table(missing_test_Cabin)
table(missing_test_Embarked)

# With the default na.strings = "NA", read.csv() leaves blank fields in
# character columns as "" rather than NA, so is.na() alone misses them.
# Count NA *or* blank per column so that blank entries are reported as well.
colSums(is.na(test) | test == "")

# Shortcut: tabulate the NA flags of each training column directly,
# without storing them in intermediate data frames.
table(is.na(train[["Pclass"]]))
table(is.na(train[["Sex"]]))
table(is.na(train[["Age"]]))
table(is.na(train[["SibSp"]]))
table(is.na(train[["Parch"]]))
table(is.na(train[["Ticket"]]))
table(is.na(train[["Fare"]]))
table(is.na(train[["Cabin"]]))
table(is.na(train[["Embarked"]]))

Exploratory data analysis

Frequency tables and descriptive statistics

# Convert the categorical training columns (including the Survived outcome)
# to factors in a single step.
train[c("Pclass", "Sex", "SibSp", "Parch", "Embarked", "Survived")] <- lapply(
  train[c("Pclass", "Sex", "SibSp", "Parch", "Embarked", "Survived")],
  as.factor
)

# Frequency table for each converted column.
table(train$Pclass)
table(train$Sex)
table(train$SibSp)
table(train$Parch)
table(train$Embarked)
table(train$Survived)

# Test data: convert the same categorical predictors to factors in one step.
test[c("Pclass", "Sex", "SibSp", "Parch", "Embarked")] <- lapply(
  test[c("Pclass", "Sex", "SibSp", "Parch", "Embarked")],
  as.factor
)

# Frequency table for each converted column.
table(test$Pclass)
table(test$Sex)
table(test$SibSp)
table(test$Parch)
table(test$Embarked)

# Descriptive statistics for the continuous variable Age (train data).
# na.rm = TRUE drops missing ages before computing sd and mean.
with(train, summary(Age))
with(train, sd(Age, na.rm = TRUE))
with(train, mean(Age, na.rm = TRUE))

# Descriptive statistics for the continuous variable Age (test data).
with(test, summary(Age))
with(test, sd(Age, na.rm = TRUE))
with(test, mean(Age, na.rm = TRUE))

# Impute missing ages in the training data with the mean of the observed
# training ages.
train$Age <- replace(
  train$Age,
  is.na(train$Age),
  mean(train$Age, na.rm = TRUE)
)

# Impute missing ages in the test data with the mean of the observed test ages.
test$Age <- replace(
  test$Age,
  is.na(test$Age),
  mean(test$Age, na.rm = TRUE)
)

## Add a Survived column to the test data for prediction.
# The column is created as an all-NA factor that reuses the level labels and
# level order of train$Survived, so both data sets agree by construction.
test$Survived <- factor(
  rep(NA_character_, nrow(test)),
  levels = levels(train$Survived)
)
levels(test$Survived)

# Do NOT assign a reordered vector to levels(train$Survived): `levels<-`
# relabels by position, so writing c("1", "0") over the existing c("0", "1")
# would turn every 0 into a 1 and vice versa, inverting the outcome for all
# later tables and models. The training levels are left untouched here.
levels(train$Survived)

# Survival counts in absolute numbers (0 = No, 1 = Yes).
table(train$Survived)

# The same counts as proportions of all training passengers.
prop.table(table(train$Survived))

Bivariate relationships

# Two-way comparison: Sex (rows) by Survived (columns), as counts.
table(train$Sex, train$Survived)

# Two-way comparison: row-wise proportions.
# margin = 1 normalises within each row, so every Sex row sums to 1 and shows
# the share of that sex in each Survived category. Without `margin`,
# prop.table() would divide by the grand total instead.
prop.table(table(train$Sex, train$Survived), margin = 1)

Make predictions using a zero classification model

Either everyone survives or everyone dies

# Prediction 1 - everyone dies

# Start from a copy of the test data and predict 0 for every passenger.
everyone_dies <- test
everyone_dies[["Survived"]] <- 0

View(everyone_dies)

# Build the submission (PassengerId plus the Survived prediction) and write it
# to a CSV file that can be uploaded to Kaggle.
submit <- with(
  everyone_dies,
  data.frame(PassengerId = PassengerId, Survived = Survived)
)

write.csv(submit, file = "everyone_dies.csv", row.names = FALSE)

# Prediction 2 - Everyone survives

# Start from a copy of the test data and predict 1 for every passenger.
everyonesurvives <- test
everyonesurvives[["Survived"]] <- 1
View(everyonesurvives)

# Build the submission (PassengerId plus the Survived prediction) and write it
# to a CSV file that can be uploaded to Kaggle.
submit <- with(
  everyonesurvives,
  data.frame(PassengerId = PassengerId, Survived = Survived)
)

write.csv(submit, file = "everyone_survives.csv", row.names = FALSE)

# Prediction 3 - All females survived / Was the "women and children first" rule followed?
# Explore the relationship between survival and sex with a cross tabulation.

# Install gmodels only when it is not already available, so re-running the
# script does not reinstall the package (or require network access) every time.
if (!requireNamespace("gmodels", quietly = TRUE)) {
  install.packages("gmodels")
}
library(gmodels)

# Cross tabulation of Survived (rows) by Sex (columns).
CrossTable(train$Survived, train$Sex)

Allfemalesurvived <- test

# Initialize the Survived column to 0 for every passenger
Allfemalesurvived$Survived <- 0

# Set Survived to 1 where Sex equals "female"
Allfemalesurvived$Survived[Allfemalesurvived$Sex == "female"] <- 1

# Export the predictions (PassengerId plus Survived) to a CSV file for Kaggle.
submit <- data.frame(
  PassengerId = Allfemalesurvived$PassengerId,
  Survived = Allfemalesurvived$Survived
)

write.csv(submit, file = "femalessurvived.csv", row.names = FALSE)

Supervised Machine learning Algorithms for predictive modeling.

Logistic regression

Train the model using the training set and check the score

# Fit a logistic regression of Survived on Pclass, Sex, Embarked and Age
# using the training data.
logistic <- glm(
  formula = Survived ~ Pclass + Sex + Embarked + Age,
  data = train,
  family = "binomial"
)

# Coefficients, standard errors and fit statistics of the model.
summary(logistic)

# Predict for the test data; type = "response" returns fitted probabilities
# rather than values on the log-odds (link) scale.
my_log_prediction <- predict(logistic, test, type = "response")

# Submission 1: the predicted probabilities.
submit <- data.frame(PassengerId = test$PassengerId, Survived = my_log_prediction)

write.csv(submit, file = "logistic_prob.csv", row.names = FALSE)

# Submission 2: class labels, obtained by cutting the probabilities at 0.5.
# The probabilities are stored in `my_log_prediction`; the script never defines
# an object called `predicted`, so referring to it would stop with an error.
predicted.classes <- ifelse(my_log_prediction > 0.5, "1", "0")
table(predicted.classes)

submit <- data.frame(PassengerId = test$PassengerId, Survived = predicted.classes)
write.csv(submit, file = "logistic_class.csv", row.names = FALSE)

# Submission 3: probability and class label side by side.
submit <- data.frame(
  PassengerId = test$PassengerId,
  Survived = my_log_prediction,
  Classification = predicted.classes
)
write.csv(submit, file = "logistic_class_prob.csv", row.names = FALSE)

Decision Trees

CART decision tree algorithm
library(rpart)

# Install the plotting helpers only when they are missing, so re-running the
# script does not reinstall them (or require network access) every time.
for (pkg in c("rattle", "rpart.plot", "RColorBrewer")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(rattle)
library(rpart.plot)

# Fit a classification tree (method = "class") of Survived on Pclass, Sex,
# Age, SibSp and Embarked using the training data.
fit <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Embarked,
  data = train,
  method = "class"
)
# fit is our decision tree model

# Visualize the tree: first with the base plot method plus text labels ...
plot(fit)
text(fit)

# ... then with the fancier plot that was loaded via rattle / rpart.plot above.
fancyRpartPlot(fit)

# Predict a class label for every test passenger and write the submission file.
Prediction <- predict(fit, test, type = "class")
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
write.csv(submit, file = "myfirstdtree.csv", row.names = FALSE)

Data Science Vidhya