In [1]:
import numpy as np
import pandas as pd

In [60]:
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import os
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:

SPAM_PATH = os.path.join("datasets", "spam")
def load_spam_data(spam_path=SPAM_PATH):
    csv_path = os.path.join(spam_path, "spam.csv")
    return pd.read_csv(csv_path)


In [14]:
spam = pd.read_csv("./datasets/spam/spam.csv", encoding = "ISO-8859-1", engine='python')

In [61]:
spam.head()

Unnamed: 0,Text,Label,Target
0,go until jurong point crazy available only in ...,ham,1
1,ok lar joking wif u oni,ham,1
2,free entry in 2 a wkly comp to win fa cup fina...,spam,0
3,u dun say so early hor u c already then say,ham,1
4,nah i dont think he goes to usf he lives aroun...,ham,1


In [16]:
spam.drop(spam.columns[[2,3,4]], axis = 1, inplace=True)

In [21]:
spam["v1"].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [23]:
spam.rename(columns={'v1':'Label', 'v2':'Text'}, inplace=True)

In [26]:
spam['Target'] = spam['Label'].map({'ham': 1, 'spam': 0})

In [28]:
spam = spam[['Text', 'Label', 'Target']]

In [49]:
clean_spam = spam

In [50]:
clean_spam.Text = clean_spam.Text.apply(lambda x: x.lower())

In [58]:
clean_spam.Text = clean_spam.Text.apply(lambda x: x.translate(str.maketrans('','',string.punctuation)))

In [62]:
clean_spam.head()

Unnamed: 0,Text,Label,Target
0,go until jurong point crazy available only in ...,ham,1
1,ok lar joking wif u oni,ham,1
2,free entry in 2 a wkly comp to win fa cup fina...,spam,0
3,u dun say so early hor u c already then say,ham,1
4,nah i dont think he goes to usf he lives aroun...,ham,1


In [64]:
vectorizer = TfidfVectorizer("english")

In [66]:
X = vectorizer.fit_transform(clean_spam['Text'])

In [67]:
y = clean_spam['Target']

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, train_size = 0.8)

In [70]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
log_clf = LogisticRegression(random_state=42)
score = cross_val_score(log_clf, X_train, y_train, cv=3, verbose=3)
score.mean()

[CV]  ................................................................
[CV] ....................... , score=0.9475100942126514, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.9434724091520862, total=   0.0s
[CV]  ................................................................
[CV] ........................ , score=0.934006734006734, total=   0.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


0.9416630791238237

In [72]:
from sklearn.metrics import precision_score, recall_score

log_clf = LogisticRegression(random_state=42)
log_clf.fit(X_train, y_train)

y_pred = log_clf.predict(X_test)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

Precision: 95.26%
Recall: 100.00%
