{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "np.random.seed(42)\n", "\n", "# To plot pretty figures\n", "%matplotlib inline\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "import os\n", "import string\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.feature_extraction.text import TfidfVectorizer" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "\n", "SPAM_PATH = os.path.join(\"datasets\", \"spam\")\n", "def load_spam_data(spam_path=SPAM_PATH):\n", " csv_path = os.path.join(spam_path, \"spam.csv\")\n", " return pd.read_csv(csv_path)\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "spam = pd.read_csv(\"./datasets/spam/spam.csv\", encoding = \"ISO-8859-1\", engine='python')" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TextLabelTarget
0go until jurong point crazy available only in ...ham1
1ok lar joking wif u oniham1
2free entry in 2 a wkly comp to win fa cup fina...spam0
3u dun say so early hor u c already then sayham1
4nah i dont think he goes to usf he lives aroun...ham1
\n", "
" ], "text/plain": [ " Text Label Target\n", "0 go until jurong point crazy available only in ... ham 1\n", "1 ok lar joking wif u oni ham 1\n", "2 free entry in 2 a wkly comp to win fa cup fina... spam 0\n", "3 u dun say so early hor u c already then say ham 1\n", "4 nah i dont think he goes to usf he lives aroun... ham 1" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "spam.head()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "spam.drop(spam.columns[[2,3,4]], axis = 1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "ham 4825\n", "spam 747\n", "Name: v1, dtype: int64" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "spam[\"v1\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "spam.rename(columns={'v1':'Label', 'v2':'Text'}, inplace=True)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "spam['Target'] = spam['Label'].map({'ham': 1, 'spam': 0})" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "spam = spam[['Text', 'Label', 'Target']]" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "clean_spam = spam" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "clean_spam.Text = clean_spam.Text.apply(lambda x: x.lower())" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "clean_spam.Text = clean_spam.Text.apply(lambda x: x.translate(str.maketrans('','',string.punctuation)))" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TextLabelTarget
0go until jurong point crazy available only in ...ham1
1ok lar joking wif u oniham1
2free entry in 2 a wkly comp to win fa cup fina...spam0
3u dun say so early hor u c already then sayham1
4nah i dont think he goes to usf he lives aroun...ham1
\n", "
" ], "text/plain": [ " Text Label Target\n", "0 go until jurong point crazy available only in ... ham 1\n", "1 ok lar joking wif u oni ham 1\n", "2 free entry in 2 a wkly comp to win fa cup fina... spam 0\n", "3 u dun say so early hor u c already then say ham 1\n", "4 nah i dont think he goes to usf he lives aroun... ham 1" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clean_spam.head()" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [], "source": [ "vectorizer = TfidfVectorizer(\"english\")" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "X = vectorizer.fit_transform(clean_spam['Text'])" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "y = clean_spam['Target']" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, train_size = 0.8)" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[CV] ................................................................\n", "[CV] ....................... , score=0.9475100942126514, total= 0.0s\n", "[CV] ................................................................\n", "[CV] ....................... , score=0.9434724091520862, total= 0.0s\n", "[CV] ................................................................\n", "[CV] ........................ , score=0.934006734006734, total= 0.0s\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.0s remaining: 0.0s\n", "[Parallel(n_jobs=1)]: Done 2 out of 2 | elapsed: 0.0s remaining: 0.0s\n", "[Parallel(n_jobs=1)]: Done 3 out of 3 | elapsed: 0.0s finished\n" ] }, { "data": { "text/plain": [ "0.9416630791238237" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.linear_model import LogisticRegression\n", "from sklearn.model_selection import cross_val_score\n", "log_clf = LogisticRegression(random_state=42)\n", "score = cross_val_score(log_clf, X_train, y_train, cv=3, verbose=3)\n", "score.mean()" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Precision: 95.26%\n", "Recall: 100.00%\n" ] } ], "source": [ "from sklearn.metrics import precision_score, recall_score\n", "\n", "log_clf = LogisticRegression(random_state=42)\n", "log_clf.fit(X_train, y_train)\n", "\n", "y_pred = log_clf.predict(X_test)\n", "\n", "print(\"Precision: {:.2f}%\".format(100 * precision_score(y_test, y_pred)))\n", "print(\"Recall: {:.2f}%\".format(100 * recall_score(y_test, y_pred)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }