Mission475Solutions.Rmd

---
title: "Conditional Probability in R: Guided Project Solutions"
output: html_document
---

```{r, warning = FALSE, message = FALSE }
library(tidyverse)
set.seed(1)
```

# Introduction

This analysis is an application of what we've learned in Dataquest's Conditional Probability course. Using a dataset of pre-labeled SMS messages, we'll create a spam filter using the Naive Bayes algorithm.

# Data 

```{r}
spam = read.csv("./data/SMSSpamCollection", sep = "\t", header = F)
colnames(spam) = c("label", "sms")
```

The `spam` dataset has `r nrow(spam)` rows and `r ncol(spam)` columns. Of these messages, `r mean(spam$label == "ham") * 100`% of them are not classified as spam, the rest are spam.

# Dividing Up Into Training and Test Sets

```{r}
n = nrow(spam)
n.training = 2547
n.cv = 318
n.test = 319

# Create the random indices for training set
train.indices = sample(1:n, size = n.training, replace = FALSE)

# Get indices not used by the training set
remaining.indices = setdiff(1:n, train.indices)

# Remaining indices are already randomized, just allocate correctly
cv.indices = remaining.indices[1:318]
test.indices = remaining.indices[319:length(remaining.indices)]

# Use the indices to create each of the datasets
spam.train = spam[train.indices,]
spam.cv = spam[cv.indices,]
spam.test = spam[test.indices,]

# Sanity check: are the ratios of ham to spam relatively constant?
print(mean(spam.train$label == "ham"))
print(mean(spam.cv$label == "ham"))
print(mean(spam.test$label == "ham"))
```

The number of ham messages in each dataset is relatively close to the original 87%. These datasets look good for future analysis.

# Data Cleaning

```{r}
# To lowercase, removal of punctuation
tidy.train = spam.train %>% 
  mutate(
    sms = tolower(sms),
    sms = str_replace_all(sms, "[[:punct:]]", ""),
    sms = str_replace_all(sms, "[[:digit:]]", " "),
    sms = str_replace_all(sms, "[\u0094\u0092\n\t]", " ")
  )

# Creating the vocabulary
vocabulary = NULL
messages = pull(tidy.train, sms)

# Iterate through the messages and add to the vocabulary
for (m in messages) {
  words = str_split(m, " ")[[1]]
  words = words[!words %in% ""]
  vocabulary = c(vocabulary, words)
}
vocabulary = unique(vocabulary)
```

# Calculating Constants and Parameters

```{r}
# Calculating Constants
# Mean of a vector of logicals is a percentage
p.spam = mean(tidy.train$label == "spam")
p.ham = mean(tidy.train$label == "ham")

# Isolate the spam and ham messages
spam.messages = tidy.train %>% 
  filter(label == "spam") %>% 
  pull("sms")

ham.messages = tidy.train %>% 
  filter(label == "ham") %>% 
  pull("sms")

spam.words = NULL
for (sm in spam.messages) {
  words = str_split(sm, " ")[[1]]
  spam.words = c(spam.words, words)
}

ham.words = NULL
for (hm in ham.messages) {
  words = str_split(hm, " ")[[1]]
  ham.words = c(ham.words, words)
}

n.spam = length(unique(spam.words))
n.ham = length(unique(ham.words))
n.vocabulary = length(vocabulary)
alpha = 1
```

```{r}
# Calculating Parameters
spam.counts = list()
ham.counts = list()
spam.probs = list()
ham.probs = list()

# This might take a while to run with so many words
for (vocab in vocabulary) {
  
  # Initialize the counts for the word
  spam.counts[[vocab]] = 0
  ham.counts[[vocab]] = 0
  
  # Break up the message and count how many times word appears
  for (sm in spam.messages) {
    words = str_split(sm, " ")[[1]]
    spam.counts[[vocab]] = spam.counts[[vocab]] + sum(words == vocab)
  }
  
  for (hm in ham.messages) {
    words = str_split(hm, " ")[[1]]
    ham.counts[[vocab]] = ham.counts[[vocab]] + sum(words == vocab)
  }
  
  # Use the counts to calculate the probability
  spam.probs[[vocab]] = (spam.counts[[vocab]] + alpha) / (n.spam + alpha * n.vocabulary)
  ham.probs[[vocab]] = (ham.counts[[vocab]] + alpha) / (n.ham + alpha * n.vocabulary)
}

```

# Classifying New Messages

```{r}
classify = function(message) {
  
  # Initializing the probability product
  p.spam.given.message = p.spam
  p.ham.given.message = p.ham
  
  # Splitting and cleaning the new message
  clean.message = tolower(message)
  clean.message = str_replace_all(clean.message, "[[:punct:]]", "")
  clean.message = str_replace_all(clean.message, "[[:digit:]]", " ")
  clean.message = str_replace_all(clean.message, "[\u0094\u0092\n\t]", " ")
  words = str_split(clean.message, " ")[[1]]
  
  for (word in words) {
    
    # Extra check if word is not in vocabulary
    wi.spam.prob = ifelse(word %in% vocabulary, 
                          spam.probs[[word]],
                          1)
    wi.ham.prob = ifelse(word %in% vocabulary, 
                         ham.probs[[word]],
                        1)
    
    p.spam.given.message = p.spam.given.message * wi.spam.prob
    p.ham.given.message = p.ham.given.message * wi.ham.prob
  }
  
  result = case_when(
    p.spam.given.message >= p.ham.given.message ~ "spam",
    p.spam.given.message < p.ham.given.message ~ "ham")
  
  return(result)
}

final.train = tidy.train %>% 
  mutate(
    prediction = unlist(map(sms, classify))
  ) %>% 
  select(label, prediction, sms)


# Results of classification on training
confusion = table(final.train$label, final.train$prediction)
accuracy = (confusion[1,1] + confusion[2,2]) / nrow(final.train)
```

Roughly, the classifier achieves about 97% accuracy on the training set. We aren't interested in how well the classifier performs with training data though, the classifier has already "seen" all of these messages.

# Hyperparameter Tuning

```{r}
alpha.grid = seq(0.1, 1, by = 0.1)
cv.accuracy = NULL

for (a in alpha.grid) {
  
  spam.probs = list()
  ham.probs = list()

  # This might take a while to run with so many words
  for (vocab in vocabulary) {
    
    # Use the counts to calculate the probability
    spam.probs[[vocab]] = (spam.counts[[vocab]] + a) / (n.spam + a * n.vocabulary)
    ham.probs[[vocab]] = (ham.counts[[vocab]] + a) / (n.ham + a * n.vocabulary)
  }
  
  cv = spam.cv %>% 
    mutate(
      prediction = unlist(map(sms, classify))
    ) %>% 
  select(label, prediction, sms)
  
  confusion = table(cv$label, cv$prediction)
  acc = (confusion[1,1] + confusion[2,2]) / nrow(cv)
  cv.accuracy = c(cv.accuracy, acc)
}

cv.check = tibble(
  alpha = alpha.grid,
  accuracy = cv.accuracy
)
cv.check
```

Judging from the cross-validation set, higher $\alpha$ values cause the accuracy to decrease. We'll go with $\alpha = 0.1$ since it produces the highest cross-validation prediction accuracy.

# Test Set Performance

```{r}
# Reestablishing the  proper parameters
optimal.alpha = 0.1
for (a in alpha.grid) {
  
  spam.probs = list()
  ham.probs = list()

  # This might take a while to run with so many words
  for (vocab in vocabulary) {
    
    # Use the counts to calculate the probability
    spam.probs[[vocab]] = (spam.counts[[vocab]] + optimal.alpha) / (n.spam + optimal.alpha * n.vocabulary)
    ham.probs[[vocab]] = (ham.counts[[vocab]] + optimal.alpha) / (n.ham + optimal.alpha * n.vocabulary)
  }
}

spam.test = spam.test %>% 
  mutate(
    prediction = unlist(map(sms, classify))
    ) %>% 
  select(label, prediction, sms)
  
confusion = table(spam.test$label, spam.test$prediction)
test.accuracy = (confusion[1,1] + confusion[2,2]) / nrow(cv)
test.accuracy
```

We've achieved an accuracy of 93% in the test set. Not bad!