
Logistic Regression - Titanic Example

This is the competition on Kaggle: https://www.kaggle.com/c/titanic

Import Data:

library(tidyverse)
df_train <- read_csv('data/train.csv')
df_test <- read_csv('data/test.csv')

1. Gender Submission

Sex is the most important feature. Create a baseline submission by setting Survived to 1 if Sex == 'female' and 0 otherwise (note that the values of Sex are lowercase in the data).

# gender submission
df_test %>% mutate(Survived = if_else(Sex == 'female',
                                      true = 1L,
                                      false = 0L)) %>% 
  select(PassengerId, Survived) %>% 
  write_csv('gender_submission_test.csv')
kaggle competitions submit -c titanic -f gender_submission_test.csv -m 'Gender Submission Test'

Public Score: 0.62679

2. Logistic Regression with Gradient Descent

2.1 Define Gradient Descent Function

See more details here: https://ikocabiyik.com/logistic-regression-with-gradient-descent/
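For reference (a standard formulation, not spelled out in the original post): with hypothesis h_theta(x) = sigmoid(theta^T x), the function below minimizes the log-loss cost

J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[ y^{(i)}\log h_\theta(x^{(i)}) + \big(1 - y^{(i)}\big)\log\big(1 - h_\theta(x^{(i)})\big) \right]

by repeating the update

\theta := \theta - \frac{\alpha}{m}\, X^\top\big(\sigma(X\theta) - y\big)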

logit_gradient_descent <- function(df, iter = 10000, learning_rate, features, response){
  
  alpha <- learning_rate
  
  # initial theta values: set all of them to 1 (one per feature, plus intercept)
  n_independent <- length(features)
  theta <- matrix(rep(1, n_independent + 1), nrow = n_independent + 1)
  
  # dependent and independent variables; prepend a column of 1s for the intercept
  y <- as.matrix(df[response])
  x <- as.matrix(df[, features])
  x <- cbind(1, x)
  
  # sigmoid maps the linear predictor to a probability in (0, 1)
  sigmoid <- function(x) {1 / (1 + exp(-x))}
  
  # gradient of the log-loss with respect to theta
  grad <- function(x, y, theta){
    m <- nrow(x)
    hx <- sigmoid(x %*% theta)
    (1 / m) * (t(x) %*% (hx - y))
  }
  
  # log-loss (cross-entropy) cost
  cost <- function(x, y, theta){
    m <- nrow(x)
    hx <- sigmoid(x %*% theta)
    (1 / m) * ((-t(y) %*% log(hx)) - t(1 - y) %*% log(1 - hx))
  }
  
  # run gradient descent, stopping early if the cost ever increases
  cost_values <- numeric(iter)
  for (i in 1:iter){
    theta <- theta - alpha * grad(x, y, theta)
    cost_values[i] <- cost(x, y, theta)[1, 1]
    if (i > 1 && cost_values[i] > cost_values[i - 1]){
      warning("Cost is not decreasing in each iteration. Lower the learning_rate.")
      break
    }
  }
  theta
}
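As a quick sanity check (illustrative only, not part of the original post), running the function on simulated data with known coefficients should recover values close to the truth:

# simulate y from a known logistic model with intercept 0.5 and slope 2
set.seed(42)
sim <- tibble(x1 = rnorm(1000))
sim$y <- rbinom(1000, 1, 1 / (1 + exp(-(0.5 + 2 * sim$x1))))
logit_gradient_descent(df = sim, iter = 50000, learning_rate = 0.1,
                       features = "x1", response = "y")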

2.2 Define Prediction Function

logit_predict <- function(df, theta, features) {
  sigmoid <- function(x) {1 / (1 + exp(-x))}
  x <- as.matrix(df[, features])
  x <- cbind(1, x)
  # return the predicted survival probabilities as a one-column tibble
  sigmoid(x %*% theta) %>%
    as_tibble()
}

2.3 Run Gradient Descent for the Training Set

The descent function expects numeric inputs, so first recode Sex as an integer (male = 1, female = 0) in both the training and the test set:

df_train <- df_train %>% 
  mutate(Sex = if_else(Sex =="male",
                       1L,
                       0L))

df_test <- df_test %>% 
  mutate(Sex = if_else(Sex =="male",
                       1L,
                       0L))


theta <- logit_gradient_descent(df = df_train,
                                iter = 10000,
                                learning_rate = 0.1,
                                features = c("Sex", "Pclass"),
                                response = 'Survived')
theta
##          Survived
##         3.2945547
## Sex    -2.6433600
## Pclass -0.9605266
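To read these coefficients (an illustrative calculation, not in the original post), plug passenger values into the sigmoid. A first-class woman (Sex = 0, Pclass = 1) gets a high predicted survival probability; a third-class man (Sex = 1, Pclass = 3) gets a low one:

sigmoid <- function(x) {1 / (1 + exp(-x))}
sigmoid(3.2945547 - 2.6433600 * 0 - 0.9605266 * 1)  # ~0.91
sigmoid(3.2945547 - 2.6433600 * 1 - 0.9605266 * 3)  # ~0.10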

2.4 Check the Performance on the Training Set

predictions_train <- logit_predict(df_train, theta, features = c("Sex", "Pclass"))
df_train$predicted <- predictions_train$Survived
df_train %>% mutate(predicted = as.integer(round(predicted))) %>% 
  mutate(has_predicted_correctly = if_else(Survived == predicted, true = "yes", false = "no")) %>% 
  group_by(has_predicted_correctly) %>% 
  count()
## # A tibble: 2 x 2
## # Groups:   has_predicted_correctly [2]
##   has_predicted_correctly     n
##   <chr>                   <int>
## 1 no                        190
## 2 yes                       701

Score on Training Set: 0.7867565
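The same number can be computed in one line (a quick check, using the predicted column created above):

mean(round(df_train$predicted) == df_train$Survived)  # 0.7867565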

2.5 Make Predictions on the Test Set and Write a Submission File

predictions <- logit_predict(df_test, theta, features = c("Sex", "Pclass"))
tibble(PassengerId = df_test$PassengerId,
       Survived = as.integer(round(predictions$Survived))) %>% 
  write_csv("logit_gender_pclass.csv")
kaggle competitions submit -c titanic -f logit_gender_pclass.csv -m 'Logistic Regression Gradient Descent Gender and Pclass'

Public Score: 0.76555

2.6 Adding More Features

df_train <- df_train %>%
  mutate(Embarked = case_when(Embarked == "S" ~ 1L,
                              Embarked == "C" ~ 2L,
                              Embarked == "Q" ~ 3L,
                              is.na(Embarked) ~ 4L,
                              TRUE ~ 5L))

df_test <- df_test %>%
  mutate(Embarked = case_when(Embarked == "S" ~ 1L,
                              Embarked == "C" ~ 2L,
                              Embarked == "Q" ~ 3L,
                              is.na(Embarked) ~ 4L,
                              TRUE ~ 5L))
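A caveat (an aside, not from the original post): this coding treats Embarked as ordered, which it is not; the two missing values in the training set simply become their own level (4). A more common alternative is one-hot (dummy) encoding, which could be sketched on the recoded values as:

# hypothetical alternative: one dummy column per port, with S as the baseline
df_train_dummies <- df_train %>%
  mutate(Embarked_C = as.integer(Embarked == 2L),
         Embarked_Q = as.integer(Embarked == 3L),
         Embarked_missing = as.integer(Embarked == 4L))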

With more features, some on larger scales (SibSp and the Embarked codes), a smaller learning rate, with correspondingly more iterations, keeps the cost decreasing:

theta <- logit_gradient_descent(df = df_train, iter = 100000, learning_rate = 0.001,
                                features = c("Sex", "Pclass", "SibSp", "Embarked"), response = 'Survived')

predictions_train <- logit_predict(df_train, theta, features = c("Sex", "Pclass", "SibSp", "Embarked"))
df_train$predicted <- predictions_train$Survived
df_train %>% mutate(predicted = as.integer(round(predicted))) %>% 
  mutate(is_prediction_true = Survived == predicted) %>% 
  group_by(is_prediction_true) %>% 
  count()
## # A tibble: 2 x 2
## # Groups:   is_prediction_true [2]
##   is_prediction_true     n
##   <lgl>              <int>
## 1 F                    176
## 2 T                    715
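That is 715 correct out of 891, a training accuracy of about 0.8025, a modest improvement over the two-feature model.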
predictions <- logit_predict(df_test, theta, features = c("Sex", "Pclass", "SibSp", "Embarked"))

tibble(PassengerId = df_test$PassengerId,
       Survived = as.integer(round(predictions$Survived))) %>% 
  write_csv("logit_gender_pclass_sibsp_embarked.csv")
kaggle competitions submit -c titanic -f logit_gender_pclass_sibsp_embarked.csv -m 'Logistic Regression Gradient Descent Gender, Pclass, SibSp and Embarked'

Public Score: 0.77511