Final Titanic

Faza Hikmatullah

Machine Learning Tutorial

December 8, 2018

Optimizing a machine learning algorithm and massaging the Titanic dataset from Kaggle.com.

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [ ]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

train.head()

Mapping

In [ ]:
# Put both datasets in one list so they can be manipulated together
train_test_data = [train, test]
In [ ]:
# Map Sex: male to 0, female to 1

sex_mapping = {"male": 0, "female": 1}
for dataset in train_test_data:
    dataset['Sex'] = dataset['Sex'].map(sex_mapping)
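
Note that .map leaves NaN for any value missing from the mapping dictionary, so it is worth confirming nothing was left unmapped (a minimal sketch, not part of the original notebook):

In [ ]:
# Sketch: verify the mapping left no NaNs and produced only 0/1
for dataset in train_test_data:
    assert dataset['Sex'].isnull().sum() == 0
print(train['Sex'].value_counts())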

Binning

Binning: converting numerical values into categorical variables

In [ ]:
# Binning age

# Fill missing ages with a constant 30, roughly the overall median age
# (a per-title median would be more precise; see the sketch below)
train["Age"].fillna(30, inplace=True)
test["Age"].fillna(30, inplace=True)

for dataset in train_test_data:
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 26), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 26) & (dataset['Age'] <= 36), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 36) & (dataset['Age'] <= 62), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 62, 'Age'] = 4
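
The constant 30 is only a rough stand-in. The per-title median mentioned in the comment above could be computed like this (a minimal sketch on a fresh copy of the raw data, so the pipeline above is untouched; the regex and variable names are illustrative assumptions):

In [ ]:
# Sketch: median age per title (Mr, Mrs, Miss, ...) from the raw file;
# these medians could replace the constant 30 used above
raw = pd.read_csv('../input/train.csv')
titles = raw['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
print(raw.groupby(titles)['Age'].median())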
In [ ]:
# Binning fare

for dataset in train_test_data:
    dataset.loc[ dataset['Fare'] <= 17, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 17) & (dataset['Fare'] <= 30), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 30) & (dataset['Fare'] <= 100), 'Fare'] = 2
    dataset.loc[ dataset['Fare'] > 100, 'Fare'] = 3
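
The same fare bins can be produced in one call with pd.cut (a minimal sketch on a fresh copy, so it does not double-bin the already-transformed columns):

In [ ]:
# Sketch: pd.cut with explicit edges reproduces the manual fare bins
fare_raw = pd.read_csv('../input/train.csv')['Fare']
fare_binned = pd.cut(fare_raw, bins=[-0.001, 17, 30, 100, fare_raw.max()],
                     labels=[0, 1, 2, 3])
print(fare_binned.value_counts().sort_index())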
In [ ]:
# Remove unnecessary features

features_drop = ['Ticket', 'SibSp', 'Parch', 'Name', 'Embarked', 'Cabin']
train = train.drop(features_drop, axis=1)
test = test.drop(features_drop, axis=1)
train = train.drop(['PassengerId'], axis=1)

Processed Train Data

In [ ]:
train

Raw Train Data

In [ ]:
raw_train = pd.read_csv('../input/train.csv')
raw_train.head()

Processed Test Data

In [ ]:
test.head()

Raw Test Data

In [ ]:
raw_test = pd.read_csv('../input/test.csv')
raw_test.head()

Training

In [ ]:
from sklearn.svm import SVC
In [ ]:
# Prepare the data to feed into the model

train_data = train.drop('Survived', axis=1)
target = train['Survived']
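
Before tuning, a quick baseline with default settings gives a reference accuracy (a minimal sketch; the 80/20 hold-out split is an assumption, not part of the original notebook):

In [ ]:
# Sketch: baseline SVC accuracy on a simple hold-out split
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(train_data, target, test_size=0.2, random_state=0)
baseline = SVC(gamma='auto')
baseline.fit(X_tr, y_tr)
print(baseline.score(X_val, y_val))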

Tuning

In [ ]:
# Shuffle and split the data into 10 folds for cross-validation

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
In [ ]:
scoring = 'accuracy'

kernels = ['linear', 'poly', 'rbf', 'sigmoid']
degrees = range(5)
results = []

def test_model(kernel, gamma='auto', degree=3):
    # degree only affects the 'poly' kernel; scikit-learn ignores it otherwise
    clf = SVC(kernel=kernel, gamma=gamma, degree=degree)
    score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
    score = round(np.mean(score) * 100, 2)
    return (kernel, score, gamma, degree)

for kernel_model in kernels:
    for degree in degrees:
        result = test_model(kernel=kernel_model, degree=degree, gamma=0.9)
        results.append(result)

results
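
Each entry in results is (kernel, score, gamma, degree), so the best configuration can be read off directly (a minimal sketch; the variable name best is illustrative):

In [ ]:
# Sketch: configuration with the highest cross-validated accuracy
best = max(results, key=lambda r: r[1])
print(best)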