{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"np.random.seed(42)\n",
"\n",
"# To plot pretty figures\n",
"%matplotlib inline\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"import os\n",
"import string\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_extraction.text import TfidfVectorizer"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"\n",
"SPAM_PATH = os.path.join(\"datasets\", \"spam\")\n",
"def load_spam_data(spam_path=SPAM_PATH):\n",
" csv_path = os.path.join(spam_path, \"spam.csv\")\n",
" return pd.read_csv(csv_path)\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"spam = pd.read_csv(\"./datasets/spam/spam.csv\", encoding = \"ISO-8859-1\", engine='python')"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Text | \n",
" Label | \n",
" Target | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" go until jurong point crazy available only in ... | \n",
" ham | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" ok lar joking wif u oni | \n",
" ham | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" free entry in 2 a wkly comp to win fa cup fina... | \n",
" spam | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" u dun say so early hor u c already then say | \n",
" ham | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" nah i dont think he goes to usf he lives aroun... | \n",
" ham | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Text Label Target\n",
"0 go until jurong point crazy available only in ... ham 1\n",
"1 ok lar joking wif u oni ham 1\n",
"2 free entry in 2 a wkly comp to win fa cup fina... spam 0\n",
"3 u dun say so early hor u c already then say ham 1\n",
"4 nah i dont think he goes to usf he lives aroun... ham 1"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"spam.head()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"spam.drop(spam.columns[[2,3,4]], axis = 1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"ham 4825\n",
"spam 747\n",
"Name: v1, dtype: int64"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"spam[\"v1\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"spam.rename(columns={'v1':'Label', 'v2':'Text'}, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"spam['Target'] = spam['Label'].map({'ham': 1, 'spam': 0})"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"spam = spam[['Text', 'Label', 'Target']]"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"clean_spam = spam"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"clean_spam.Text = clean_spam.Text.apply(lambda x: x.lower())"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"clean_spam.Text = clean_spam.Text.apply(lambda x: x.translate(str.maketrans('','',string.punctuation)))"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Text | \n",
" Label | \n",
" Target | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" go until jurong point crazy available only in ... | \n",
" ham | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" ok lar joking wif u oni | \n",
" ham | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" free entry in 2 a wkly comp to win fa cup fina... | \n",
" spam | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" u dun say so early hor u c already then say | \n",
" ham | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" nah i dont think he goes to usf he lives aroun... | \n",
" ham | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Text Label Target\n",
"0 go until jurong point crazy available only in ... ham 1\n",
"1 ok lar joking wif u oni ham 1\n",
"2 free entry in 2 a wkly comp to win fa cup fina... spam 0\n",
"3 u dun say so early hor u c already then say ham 1\n",
"4 nah i dont think he goes to usf he lives aroun... ham 1"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clean_spam.head()"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"vectorizer = TfidfVectorizer(\"english\")"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"X = vectorizer.fit_transform(clean_spam['Text'])"
]
},
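{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# optional sanity check (added sketch, not part of the original run): X is a sparse\n",
"# document-term matrix with one row per message and one column per vocabulary term\n",
"print(X.shape)\n",
"print(len(vectorizer.vocabulary_))"
]
},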
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"y = clean_spam['Target']"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, train_size = 0.8)"
]
},
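{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# optional check (added sketch): the classes are imbalanced (~87% ham),\n",
"# so verify the split keeps roughly the same ham/spam ratio on both sides\n",
"print(y_train.value_counts(normalize=True))\n",
"print(y_test.value_counts(normalize=True))"
]
},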
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[CV] ................................................................\n",
"[CV] ....................... , score=0.9475100942126514, total= 0.0s\n",
"[CV] ................................................................\n",
"[CV] ....................... , score=0.9434724091520862, total= 0.0s\n",
"[CV] ................................................................\n",
"[CV] ........................ , score=0.934006734006734, total= 0.0s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.0s remaining: 0.0s\n",
"[Parallel(n_jobs=1)]: Done 2 out of 2 | elapsed: 0.0s remaining: 0.0s\n",
"[Parallel(n_jobs=1)]: Done 3 out of 3 | elapsed: 0.0s finished\n"
]
},
{
"data": {
"text/plain": [
"0.9416630791238237"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.model_selection import cross_val_score\n",
"log_clf = LogisticRegression(random_state=42)\n",
"score = cross_val_score(log_clf, X_train, y_train, cv=3, verbose=3)\n",
"score.mean()"
]
},
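{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# optional context (added sketch): compare the cross-validated accuracy above\n",
"# with a majority-class baseline, since ~87% of the messages are ham\n",
"from sklearn.dummy import DummyClassifier\n",
"\n",
"dummy_clf = DummyClassifier(strategy=\"most_frequent\")\n",
"cross_val_score(dummy_clf, X_train, y_train, cv=3).mean()"
]
},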
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Precision: 95.26%\n",
"Recall: 100.00%\n"
]
}
],
"source": [
"from sklearn.metrics import precision_score, recall_score\n",
"\n",
"log_clf = LogisticRegression(random_state=42)\n",
"log_clf.fit(X_train, y_train)\n",
"\n",
"y_pred = log_clf.predict(X_test)\n",
"\n",
"print(\"Precision: {:.2f}%\".format(100 * precision_score(y_test, y_pred)))\n",
"print(\"Recall: {:.2f}%\".format(100 * recall_score(y_test, y_pred)))"
]
},
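{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# optional follow-up (added sketch): a confusion matrix shows the error types;\n",
"# with this encoding, row/column 0 is spam and row/column 1 is ham\n",
"from sklearn.metrics import confusion_matrix\n",
"\n",
"confusion_matrix(y_test, y_pred)"
]
},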
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}