# Optimizing a machine-learning model on the Kaggle Titanic dataset: cleaning the data, then tuning an SVC.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline  (IPython notebook magic — no effect in a plain .py script)
# Load the Kaggle Titanic train/test splits from the competition input folder.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
train.head()

# Keep both frames in one list so each transform below is applied to both.
train_test_data = [train, test]

# Encode Sex numerically: male -> 0, female -> 1.
sex_mapping = {"male": 0, "female": 1}
for df in train_test_data:
    df['Sex'] = df['Sex'].map(sex_mapping)
# Binning: converting numerical values into categorical variables
# Binning age into five ordinal buckets (0 = youngest, 4 = oldest).
# Fill missing ages with the constant 30 (approximately the overall median).
# NOTE(review): the original comment promised per-title medians — confirm
# whether the constant fill is an intentional simplification.
train["Age"].fillna(30, inplace=True)
test["Age"].fillna(30, inplace=True)
# BUG FIX vs. original:
#  - removed the trailing commas, which made each assignment a 1-tuple (4,)
#  - the first two bins were both labelled 4; bins are now distinct and
#    ascending (0..4), matching the ascending Fare bins below.
for dataset in train_test_data:
    dataset.loc[dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 26), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 26) & (dataset['Age'] <= 36), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 36) & (dataset['Age'] <= 62), 'Age'] = 3
    dataset.loc[dataset['Age'] > 62, 'Age'] = 4
# Binning fare into four ordinal buckets (0 = cheapest, 3 = most expensive).
# BUG FIX: removed the trailing commas — `... = 0,` assigned the 1-tuple (0,)
# to the selection instead of the scalar bin label.
for dataset in train_test_data:
    dataset.loc[dataset['Fare'] <= 17, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 17) & (dataset['Fare'] <= 30), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 30) & (dataset['Fare'] <= 100), 'Fare'] = 2
    dataset.loc[dataset['Fare'] > 100, 'Fare'] = 3
# Remove columns that are not fed to the model.
features_drop = ['Ticket', 'SibSp', 'Parch', 'Name', 'Embarked', 'Cabin']
test = test.drop(features_drop, axis=1)
# The training frame also sheds PassengerId; the test frame keeps it
# (it is only an identifier used for the submission file).
train = train.drop(features_drop + ['PassengerId'], axis=1)
train

# Reload the untouched CSVs so the raw data can be inspected side by side
# with the transformed frames.
raw_train = pd.read_csv('../input/train.csv')
raw_train.head()
test.head()
raw_test = pd.read_csv('../input/test.csv')
raw_test.head()
from sklearn.svm import SVC
# Prepare the data to feed into the model: features vs. the Survived label.
train_data = train.drop('Survived', axis=1)
target = train['Survived']
# Shuffled 10-fold cross-validation with a fixed seed for reproducibility.
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
scoring = 'accuracy'
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
degrees = range(5)  # degree only matters for 'poly', but is recorded for all
results = []


def test_model(kernel, gamma='auto', degree=3):
    """Cross-validate one SVC configuration.

    Returns (kernel, mean_accuracy_percent, gamma, degree).
    """
    # BUG FIX: use the `kernel` parameter — the original read the loop
    # global `kernel_model`, silently ignoring its own argument.
    clf = SVC(kernel=kernel, gamma=gamma, degree=degree)
    score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1,
                            scoring=scoring)
    # BUG FIX: the original was missing the round() call, so it built the
    # tuple (mean*100, 2) instead of a rounded percentage.
    score = round(np.mean(score) * 100, 2)
    return (kernel, score, gamma, degree)


for kernel_model in kernels:
    for degree in degrees:
        result = test_model(kernel=kernel_model, degree=degree, gamma=0.9)
        results.append(result)
results