{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "np.random.seed(42)\n", "\n", "# To plot pretty figures\n", "%matplotlib inline\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "import os\n", "import string\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.feature_extraction.text import TfidfVectorizer" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "\n", "SPAM_PATH = os.path.join(\"datasets\", \"spam\")\n", "def load_spam_data(spam_path=SPAM_PATH):\n", " csv_path = os.path.join(spam_path, \"spam.csv\")\n", " return pd.read_csv(csv_path)\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "spam = pd.read_csv(\"./datasets/spam/spam.csv\", encoding = \"ISO-8859-1\", engine='python')" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TextLabelTarget
0go until jurong point crazy available only in ...ham1
1ok lar joking wif u oniham1
2free entry in 2 a wkly comp to win fa cup fina...spam0
3u dun say so early hor u c already then sayham1
4nah i dont think he goes to usf he lives aroun...ham1
\n", "
" ], "text/plain": [ " Text Label Target\n", "0 go until jurong point crazy available only in ... ham 1\n", "1 ok lar joking wif u oni ham 1\n", "2 free entry in 2 a wkly comp to win fa cup fina... spam 0\n", "3 u dun say so early hor u c already then say ham 1\n", "4 nah i dont think he goes to usf he lives aroun... ham 1" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "spam.head()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "spam.drop(spam.columns[[2,3,4]], axis = 1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "ham 4825\n", "spam 747\n", "Name: v1, dtype: int64" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "spam[\"v1\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "spam.rename(columns={'v1':'Label', 'v2':'Text'}, inplace=True)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "spam['Target'] = spam['Label'].map({'ham': 1, 'spam': 0})" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "spam = spam[['Text', 'Label', 'Target']]" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "clean_spam = spam" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "clean_spam.Text = clean_spam.Text.apply(lambda x: x.lower())" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "clean_spam.Text = clean_spam.Text.apply(lambda x: x.translate(str.maketrans('','',string.punctuation)))" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TextLabelTarget
0go until jurong point crazy available only in ...ham1
1ok lar joking wif u oniham1
2free entry in 2 a wkly comp to win fa cup fina...spam0
3u dun say so early hor u c already then sayham1
4nah i dont think he goes to usf he lives aroun...ham1
\n", "
" ], "text/plain": [ " Text Label Target\n", "0 go until jurong point crazy available only in ... ham 1\n", "1 ok lar joking wif u oni ham 1\n", "2 free entry in 2 a wkly comp to win fa cup fina... spam 0\n", "3 u dun say so early hor u c already then say ham 1\n", "4 nah i dont think he goes to usf he lives aroun... ham 1" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clean_spam.head()" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [], "source": [ "vectorizer = TfidfVectorizer(\"english\")" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "X = vectorizer.fit_transform(clean_spam['Text'])" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "y = clean_spam['Target']" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, train_size = 0.8)" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[CV] ................................................................\n", "[CV] ....................... , score=0.9475100942126514, total= 0.0s\n", "[CV] ................................................................\n", "[CV] ....................... , score=0.9434724091520862, total= 0.0s\n", "[CV] ................................................................\n", "[CV] ........................ , score=0.934006734006734, total= 0.0s\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.0s remaining: 0.0s\n", "[Parallel(n_jobs=1)]: Done 2 out of 2 | elapsed: 0.0s remaining: 0.0s\n", "[Parallel(n_jobs=1)]: Done 3 out of 3 | elapsed: 0.0s finished\n" ] }, { "data": { "text/plain": [ "0.9416630791238237" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.linear_model import LogisticRegression\n", "from sklearn.model_selection import cross_val_score\n", "log_clf = LogisticRegression(random_state=42)\n", "score = cross_val_score(log_clf, X_train, y_train, cv=3, verbose=3)\n", "score.mean()" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Precision: 95.26%\n", "Recall: 100.00%\n" ] } ], "source": [ "from sklearn.metrics import precision_score, recall_score\n", "\n", "log_clf = LogisticRegression(random_state=42)\n", "log_clf.fit(X_train, y_train)\n", "\n", "y_pred = log_clf.predict(X_test)\n", "\n", "print(\"Precision: {:.2f}%\".format(100 * precision_score(y_test, y_pred)))\n", "print(\"Recall: {:.2f}%\".format(100 * recall_score(y_test, y_pred)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }