From 67c921fdf9ca0f49bda3d8b598ef5e3d56472679 Mon Sep 17 00:00:00 2001 From: Pascal Keilbach Date: Thu, 23 Nov 2023 20:40:15 +0100 Subject: [PATCH 1/6] add naive bayes lecture to nav --- docs/lectures/{part03 => }/naive_bayes.md | 0 mkdocs.yml | 1 + 2 files changed, 1 insertion(+) rename docs/lectures/{part03 => }/naive_bayes.md (100%) diff --git a/docs/lectures/part03/naive_bayes.md b/docs/lectures/naive_bayes.md similarity index 100% rename from docs/lectures/part03/naive_bayes.md rename to docs/lectures/naive_bayes.md diff --git a/mkdocs.yml b/mkdocs.yml index fa5c7b2..cd0215c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -57,6 +57,7 @@ nav: - lectures/preprocessing.md - lectures/feature_extraction.md - lectures/logistic_regression.md + - lectures/naive_bayes.md - Assignments: assignments.md - Presentations: - presentations/presentations.md From b0c8aa083ef30fd9cc661402c3744a5feb2b8de8 Mon Sep 17 00:00:00 2001 From: Pascal Keilbach Date: Sun, 3 Dec 2023 10:51:26 +0100 Subject: [PATCH 2/6] add naive bayes lecture --- ...bayes-conditional-probability-1.drawio.svg | 76 +++ docs/img/naive-bayes-prediction.drawio.svg | 130 ++++ ...-bayes-probability-intersection.drawio.svg | 94 +++ docs/img/naive-bayes-training.drawio.svg | 133 ++++ docs/lectures/naive_bayes.md | 628 ++++++++++++++++++ 5 files changed, 1061 insertions(+) create mode 100644 docs/img/naive-bayes-conditional-probability-1.drawio.svg create mode 100644 docs/img/naive-bayes-prediction.drawio.svg create mode 100644 docs/img/naive-bayes-probability-intersection.drawio.svg create mode 100644 docs/img/naive-bayes-training.drawio.svg diff --git a/docs/img/naive-bayes-conditional-probability-1.drawio.svg b/docs/img/naive-bayes-conditional-probability-1.drawio.svg new file mode 100644 index 0000000..2f1ff2e --- /dev/null +++ b/docs/img/naive-bayes-conditional-probability-1.drawio.svg @@ -0,0 +1,76 @@ + + + + + + + + + + + +
+
+
+ "amazing" +
+
+
+
+ + "amazing" + +
+
+ + + + + + Corpus + + + + + +
+
+
+ "amazing" + + + + + + + + + + + + + + + positive +
+
+
+
+ + "amazing" \[\... + +
+
+ +
+ + + + + Text is not SVG - cannot display + + + +
\ No newline at end of file diff --git a/docs/img/naive-bayes-prediction.drawio.svg b/docs/img/naive-bayes-prediction.drawio.svg new file mode 100644 index 0000000..8005cd9 --- /dev/null +++ b/docs/img/naive-bayes-prediction.drawio.svg @@ -0,0 +1,130 @@ + + + + + + + + + + + + + + + + +
+
+
+ Preprocessing +
+
+
+
+ + Preprocessing + +
+
+ + + + + + + + + +
+
+
+ I am happy because I love ice cream +
+
+
+
+ + I am happy because I love ice cream + +
+
+ + + + + + + + + + + + + +
+
+
+ conditional probabilities +
+
+
+
+ + conditio... + +
+
+ + + + + + + + + +
+
+
+ [I, am, happy, because, I, love, ice, cream] +
+
+
+
+ + [I, am, happy, because, I, love, ice, cream] + +
+
+ + + + + + +
+
+
+ Prediction +
+
+
+
+ + Prediction + +
+
+
+ + + + + Text is not SVG - cannot display + + + +
\ No newline at end of file diff --git a/docs/img/naive-bayes-probability-intersection.drawio.svg b/docs/img/naive-bayes-probability-intersection.drawio.svg new file mode 100644 index 0000000..8e48a10 --- /dev/null +++ b/docs/img/naive-bayes-probability-intersection.drawio.svg @@ -0,0 +1,94 @@ + + + + + + + + + + + +
+
+
+ Positive +
+
+
+
+ + Positive + +
+
+ + + + + + +
+
+
+ "amazing" +
+
+
+
+ + "amazing" + +
+
+ + + + + + Corpus + + + + + +
+
+
+ "amazing" + + + + + + + + + + + + + + + positive +
+
+
+
+ + "amazing" \[\... + +
+
+
+ + + + + Text is not SVG - cannot display + + + +
\ No newline at end of file diff --git a/docs/img/naive-bayes-training.drawio.svg b/docs/img/naive-bayes-training.drawio.svg new file mode 100644 index 0000000..10c6580 --- /dev/null +++ b/docs/img/naive-bayes-training.drawio.svg @@ -0,0 +1,133 @@ + + + + + + + + + + + + + + + + +
+
+
+ Preprocessing +
+
+
+
+ + Preprocessing + +
+
+ + + + + + + + + + + +
+
+
+ Calculate Log Prior Ratio +
+
+
+
+ + Calculate Log Pri... + +
+
+ + + + + + Naive Bayes Training + + + + + + + + + + +
+
+
+ Compute table of word frequencies +
+
+
+
+ + Compute table of... + +
+
+ + + + + + + + +
+
+
+ Compute table of conditional probabilities +
+
+
+
+ + Compute table of... + +
+
+ + + + + +
+
+
+ Compute log ratio of conditional probabilities +
+
+
+
+ + Compute log ratio... + +
+
+
+ + + + + Text is not SVG - cannot display + + + +
\ No newline at end of file diff --git a/docs/lectures/naive_bayes.md b/docs/lectures/naive_bayes.md index 6bb80f3..206766f 100644 --- a/docs/lectures/naive_bayes.md +++ b/docs/lectures/naive_bayes.md @@ -1 +1,629 @@ # Naive Bayes + +In this lecture, we will learn about the Naive Bayes classifier for binary classification. + +Naive Bayes is a **simple but powerful** classifier that doesn't require to find any hyperparameters. + +It is very fast to train and predict, and can perform surprisingly well. + +## TL;DR + +When using Naive Bayes for binary classification, we need to calculate the **likelihood** of a tweet being positive or negative. + +To do this, we need to build the **log ratio of probabilities** for each word in the vocabulary. + +For example, if the word "happy" appears 20 times in positive tweets and 5 times in negative tweets, then the ratio of probabilities is $20/5=4$. This means that the word "happy" is more likely to appear in a positive tweet. If the ratio would be less than 1, then the word is more likely to appear in a negative tweet. + +Taking the logarithm is a mathematical trick to avoid **numerical underflow** and simplify the calculations. + +Using the **log ratio of probabilities**, we can calculate the log likelihood of a tweet being positive or negative by summing up the log ratio of probabilities for each word in the tweet, and thus, **predict** the class of the tweet. + +## Probability Recap + +Let + +- $A$ be the event that a tweet being labeled positive +- $N_{pos}$ be the number of positive tweets +- $N$ be the total number of tweets + +Then the probability $P(A)$ of a tweet being positive is the **number of positive tweets** divided by the **total number of tweets**: + +$$ +P(A) = \frac{N_{pos}}{N} +$$ + +!!! note + + For **binary classification**, if there a tweet can only be either positive or negative, then the probability of a tweet being negative is $1-P(A)$. + +!!! example + + If there are 35 positive tweets and 100 tweets in total, then the probability $P(A)$ of a **tweet being positive** is calculated as + + $$ + P(A)=35/100=0.35. + $$ + + The probability of the **tweet being negative** is then calculated as $1-P(A) = 0.65$. + +Let $B$ be the event that a tweet contains the word "amazing". Then the probability $P(B)$ of a **tweet containing the word "amazing"** is the number of tweets containing the word divided by the total number of tweets: + +$$ +P(B) = \frac{N_{amazing}}{N} +$$ + +!!! example + + If there are 5 tweets containing the word "amazing" and 100 tweets in total, then the probability $P(B)$ of a tweet containing the word "amazing" is calculated as + + $$ + P(B) = 5/100 = 0.05 + $$ + +## Intersection of Two Events + +Let $A \cap B$ be the event that a **tweet is positive and contains the word "amazing"**. + +Then the probability $P(A \cap B)$ is calculated as the number of tweets that are positive and contain the word "amazing" divided by the total number of tweets: + +$$ +P(A \cap B) = \frac{N_{pos \cap amazing}}{N} +$$ + +The following Venn diagram illustrates this: + +![Probability of the intersection of two events](../img/naive-bayes-probability-intersection.drawio.svg) + +!!! example + + Let's assume a corpus of 100 tweets: + + - 35 tweets are positive + - 65 tweets are negative + - 5 tweets contain the word "amazing", but one of them is negative (e.g. 
"I thought this movie was amazing, but it was actually terrible!") + + Then the probability $P(A \cap B)$ of a tweet being positive and containing the word "amazing" is calculated as + + $$ + P(A \cap B) = \frac{4}{100} = 0.04 + $$ + +## Conditional Probability + +Continuing the example from above, let's assume we want to calculate the probability of a tweet being positive, but knowing that the tweet contains the word "amazing". + +Looking at the diagram from above, this means we only consider the **blue circle**. + +In our example, this is the **probability of the intersection** of the tweets being positive and containing the word "amazing" divided by the probability of all tweets containing the word "amazing". + +This is called the **conditional probability** $P(A|B)$ of a tweet being positive, given that it contains the word "amazing". + +It is calculated as follows: + +$$ +P(A|B) = \frac{P(A \cap B)}{P(B)} +$$ + +!!! quote "Conditional Probability" + + The conditional probability $P(A|B)$ is the probability of event $A$ _given that_ event $B$ has occurred. + + For example, $P(\text{positive}|\text{happy})$ is the probability of a tweet being positive, given that it contains the word "happy". + +!!! example + + Let's continue the example from above, where we have 5 tweets containing the word "amazing" and 4 of them are positive. + + Then the probability $P(A|B)$ of a tweet being positive, given that it contains the word "amazing" is calculated as + + $$ + P(A|B) = \frac{P(A \cap B)}{P(B)} = \frac{4/100}{5/100} = \frac{4}{5} = 0.8 + $$ + +!!! example + + Now let's turn it around and calculate the probability of a tweet containing the word "amazing", given that it is positive. + + This is calculated as follows: + + $$ + P(B|A) = \frac{P(A \cap B)}{P(A)} = \frac{4/100}{35/100} = \frac{4}{35} = 0.114 + $$ + +## Bayes Rule + +Now we can derive Bayes Rule, which is based on conditional probabilities. + +We know that the conditional probability $P(A|B)$ is calculated as follows: + +$$ +P(A|B) = \frac{P(A \cap B)}{P(B)} +$$ + +and we also know that the conditional probability $P(B|A)$ is calculated as follows: + +$$ +P(B|A) = \frac{P(B \cap A)}{P(A)} +$$ + +Given that + +$$ +P(A \cap B) = P(B \cap A) +$$ + +we can rewrite the equation as follows: + +$$ +P(A|B) = \frac{P(B|A)P(A)}{P(B)} +$$ + +With that, we have **derived Bayes Rule**. + +!!! quote "Bayes Rule" + + Bayes Rule is a way to calculate the conditional probability $P(A|B)$, given that we know $P(B|A)$. + + It is calculated as follows: + + $$ + P(A|B) = \frac{P(B|A)P(A)}{P(B)} + $$ + +!!! example + + Suppose that in your dataset, 25% of the positive tweets contain the word "amazing". + You also know that a total of 13% of the tweets in your dataset contain the word "amazing", and that 40% of the total number of tweets are positive. + Given the tweet "amazing to be here". What is the probability that this tweet is positive? + + Let $A$ be the event that a tweet is positive and $B$ be the event that a tweet contains the word "amazing". + + $$ + \begin{aligned} + P(A) &= 0.4 \\ + P(B) &= 0.13 \\ + P(B|A) &= 0.25 \\ + P(A|B) &= \frac{P(B|A)P(A)}{P(B)} = \frac{0.25 \times 0.4}{0.13} = 0.769 + \end{aligned} + $$ + + The probability that the tweet "amazing to be here" is positive is 0.769. + +!!! 
info "Thomas Bayes" + + ![Thomas Bayes](https://upload.wikimedia.org/wikipedia/commons/d/d4/Thomas_Bayes.gif) + + [Thomas Bayes](https://en.wikipedia.org/wiki/Thomas_Bayes) (1701 - 1761) was an English statistician, philosopher and Presbyterian minister. + Bayes never published what would become his most famous accomplishment; his notes were edited and published posthumously by Richard Price. + + In NLP, Bayes' Theorem can be used for: + + - **classification**: given a document, what is the probability that it belongs to a certain class? (e.g. spam or not spam or sentiment analysis) + - **information retrieval**: given a query, what is the probability that a document is relevant? + - **word sense disambiguation**: given a word, what is the probability that it has a certain meaning? + + Besides machine learning and NLP, Bayes' Theorem is also used in many other fields, such as: + + - **medicine**: given a symptom, what is the probability that a patient has a certain disease? + - **biology**: given a genetic profile, what is the probability that a person will develop a certain disease? + - **economics**: given a set of economic conditions, what is the probability that the economy will be in a recession next year? + - **finance**: given a set of financial conditions, what is the probability that a stock will increase in value next year? + +## Laplacian Smoothing + +Using Bayes Rule, we can calculate the probability of a word given a class $P(w|c)$ as follows: + +$$ +P(w|c) = \frac{P(c|w)P(w)}{P(c)} = \frac{freq(w,c)}{N_c} +$$ + +However, if a word has not been seen in the training data, then $freq(w,c) = 0$ and thus, $P(w|c) = 0$. + +To account for this, we can use Laplacian Smoothing (aka Additive Smoothing). + +This is done by adding the **smoothing constant** $\alpha$ to the numerator and $\alpha|V|$ to the denominator: + +$$ +P(w|c) = \frac{freq(w,c) + \alpha}{N_c + \alpha|V|} +$$ + +!!! info + + If we add a constant $\alpha$ to the numerator, and since there are $|V|$ words in the vocabulary to normalize, we have to add $\alpha|V|$ to the denominator. + This way, the probabilities will sum up to 1. + + Note that $\alpha$ is usually set to 1. + +## Word Probabilities + +Let's assume we have the following table of word frequencies (as from the [feature extraction lecture](./feature_extraction.md#positive-and-negative-frequencies)): + +| $V$ | $n_{pos}$ | $n_{neg}$ | +| ---------- | --------- | --------- | +| I | 2 | 2 | +| am | 2 | 2 | +| happy | 2 | 0 | +| sad | 0 | 2 | +| because | 1 | 1 | +| love | 1 | 0 | +| hate | 0 | 1 | +| the | 1 | 1 | +| weather | 1 | 1 | +| **$\sum$** | **10** | **10** | + +Note that we added the last row to calculate the total number of words per class. +This allows us to calculate the probabilities $P(w|pos)$ and $P(w|neg)$ for each word $w$ in a class. + +Using the formula for Laplacian Smoothing with $\alpha=1$ + +$$ +P(w|c) = \frac{freq(w,c) + 1}{N_c + |V|} +$$ + +We end up with the following table: + +| $V$ | $P(w \vert pos)$ | $P(w \vert neg)$ | +| ---------- | ---------------- | ---------------- | +| I | 0.16 | 0.16 | +| am | 0.16 | 0.16 | +| happy | 0.16 | 0.05 | +| sad | 0.05 | 0.16 | +| because | 0.11 | 0.11 | +| love | 0.11 | 0.05 | +| hate | 0.05 | 0.11 | +| the | 0.11 | 0.11 | +| weather | 0.11 | 0.11 | +| **$\sum$** | **$\approx 1$** | **$\approx 1$** | + +!!! note + + Since the numbers have been rounded, the probabilities don't sum up to exactly 1. + +!!! 
example

    Let's calculate the probability $P(\text{happy}|\text{pos})$ of the word "happy" given that the tweet is **positive**.

    $$
    \begin{aligned}
    P(\text{happy}|\text{pos}) &= \frac{freq(\text{happy},\text{pos}) + 1}{N_{pos} + |V|} \\
    &= \frac{2 + 1}{10 + 9} \\
    &= \frac{3}{19} \\
    &= 0.157894 \\
    &\approx 0.16
    \end{aligned}
    $$

    Let's calculate the probability $P(\text{happy}|\text{neg})$ of the word "happy" given that the tweet is **negative**.

    $$
    \begin{aligned}
    P(\text{happy}|\text{neg}) &= \frac{freq(\text{happy},\text{neg}) + 1}{N_{neg} + |V|} \\
    &= \frac{0 + 1}{10 + 9} \\
    &= \frac{1}{19} \\
    &= 0.052631 \\
    &\approx 0.05
    \end{aligned}
    $$

## Ratio of Probabilities

Now that we have the probabilities $P(w|pos)$ and $P(w|neg)$ for each word $w$ in a class, we can calculate the ratio of probabilities for each word $w$ in the vocabulary:

$$
\frac{P(w \vert pos)}{P(w \vert neg)}
$$

Based on the ratio, we can make the following observations:

- If the ratio is **greater than 1**, then the word is more likely to appear in a **positive** tweet.
- If the ratio is **less than 1**, then the word is more likely to appear in a **negative** tweet.
- If the ratio is **equal to 1**, then the word is considered **neutral** and equally likely to appear in a positive or negative tweet.

| $V$     | $P(w \vert pos)$ | $P(w \vert neg)$ | $\frac{P(w \vert pos)}{P(w \vert neg)}$ |
| ------- | ---------------- | ---------------- | --------------------------------------- |
| I       | 0.16             | 0.16             | 1.0                                     |
| am      | 0.16             | 0.16             | 1.0                                     |
| happy   | 0.16             | 0.05             | 3.2                                     |
| sad     | 0.05             | 0.16             | 0.3125                                  |
| because | 0.11             | 0.11             | 1.0                                     |
| love    | 0.11             | 0.05             | 2.2                                     |
| hate    | 0.05             | 0.11             | 0.4545                                  |
| the     | 0.11             | 0.11             | 1.0                                     |
| weather | 0.11             | 0.11             | 1.0                                     |

!!! note

    Words that are neutral don't provide any information for classification.

## Likelihood

Now that we have the ratio of probabilities for each word $w$ in the vocabulary, we can calculate the probability of a tweet being positive or negative.

To **classify a whole tweet**, we need to **multiply the ratios of probabilities** for each word in the tweet.

This is called the **likelihood** $P(\text{pos}|\text{tweet})$ of a tweet being positive and is calculated as follows:

$$
P(\text{pos}|\text{tweet}) = \prod_{i=1}^{m} \frac{P(w_i|pos)}{P(w_i|neg)}
$$

where

- $m$ is the number of words in the tweet and
- $w_i$ is the $i$-th word in the tweet.

!!! note

    - If the likelihood is greater than 1, then the tweet is more likely to be positive.
    - If the likelihood is less than 1, then the tweet is more likely to be negative.
    - If the likelihood is equal to 1, then the tweet is equally likely to be positive or negative and thus, neutral.

!!! 
example

    Given the table above, let's see if the following tweet is positive or negative:

    > I am happy because I love ice cream

    We have the following ratios of probabilities:

    $$
    \begin{aligned}
    \frac{P(\text{I}|\text{pos})}{P(\text{I}|\text{neg})} &= \frac{0.16}{0.16} = 1.0 \\
    \frac{P(\text{am}|\text{pos})}{P(\text{am}|\text{neg})} &= \frac{0.16}{0.16} = 1.0 \\
    \frac{P(\text{happy}|\text{pos})}{P(\text{happy}|\text{neg})} &= \frac{0.16}{0.05} = 3.2 \\
    \frac{P(\text{because}|\text{pos})}{P(\text{because}|\text{neg})} &= \frac{0.11}{0.11} = 1.0 \\
    \frac{P(\text{I}|\text{pos})}{P(\text{I}|\text{neg})} &= \frac{0.16}{0.16} = 1.0 \\
    \frac{P(\text{love}|\text{pos})}{P(\text{love}|\text{neg})} &= \frac{0.11}{0.05} = 2.2 \\
    \end{aligned}
    $$

    Note that the words "ice" and "cream" are not in the vocabulary, so we ignore them.

    Given these ratios, we can calculate the likelihood of the tweet being positive as follows:

    $$
    \begin{aligned}
    P(\text{pos}|\text{tweet}) &= \prod_{i=1}^{m} \frac{P(w_i|pos)}{P(w_i|neg)} \\
    &= 1.0 \times 1.0 \times 3.2 \times 1.0 \times 1.0 \times 2.2 \\
    &= 7.04
    \end{aligned}
    $$

## Prior

The prior $P(pos)$ is the probability of a tweet being positive, **regardless** of the words in the tweet.

The prior probability represents the probability of a particular class **before considering any features**.

The prior is especially important when the dataset is **unbalanced**.

In a binary classification problem, the prior for a class is calculated as follows:

$$
prior = \frac{N_c}{N}
$$

where

- $N_c$ is the number of tweets in the class and
- $N$ is the total number of tweets.

!!! example

    Let's assume we have the following corpus of 100 tweets:

    - 35 tweets are positive
    - 65 tweets are negative

    Then the prior probability of a tweet being positive is calculated as

    $$
    prior = \frac{N_c}{N} = \frac{35}{100} = 0.35
    $$

    and the prior probability of a tweet being negative is calculated as

    $$
    prior = \frac{N_c}{N} = \frac{65}{100} = 0.65
    $$

## Prior Ratio

The prior ratio is the ratio of the prior probabilities of the two classes.

In a binary classification problem, the prior ratio is calculated as follows:

$$
\text{prior ratio} = \frac{P(pos)}{P(neg)}
$$

!!! example

    Let's assume we have the following corpus of 100 tweets:

    - 35 tweets are positive
    - 65 tweets are negative

    Then the prior ratio is calculated as

    $$
    \frac{P(pos)}{P(neg)} = \frac{0.35}{0.65} = 0.538
    $$

If we apply the prior to the likelihood, we get the following formula:

$$
P(\text{pos}|\text{tweet}) = \frac{P(pos)}{P(neg)} \times \prod_{i=1}^{m} \frac{P(w_i|pos)}{P(w_i|neg)}
$$

## Using Logarithms

The likelihood is the product of many probabilities, i.e. values between 0 and 1.

When we multiply many such values, the result can become so **small** that computers have trouble representing it.

This is called **numerical underflow**.

To avoid this, we can use the logarithm instead.

!!! tip

    Because they avoid the risk of numerical underflow and they are much more convenient to work with, **logarithms** appear throughout deep learning and NLP.

### Log Likelihood

Applying the logarithm to the likelihood formula from above, we get the following formula:

$$
\log P(\text{pos}|\text{tweet}) = \log \frac{P(pos)}{P(neg)} + \sum_{i=1}^{m} \log \frac{P(w_i|pos)}{P(w_i|neg)}
$$

!!! 
note "Logarithm" + + Besides avoiding numerical underflow, another advantage of logarithms is that they allow us to use simpler operations, such as addition instead of multiplication. + + This is because of the following property of logarithms: + + $$ + \log (ab) = \log a + \log b + $$ + + Thus, the product changes to a sum in the formula above. + +### Log Prior Ratio + +When using the logarithm, we speak of the prior as the **log prior**, and of the prior ratio as the **log prior ratio**. + +$$ +\log \frac{P(pos)}{P(neg)} +$$ + +### Log Ratio of Probabilities + +Now, if we calculate the ratio of probabilities using the logarithm, the table above looks as follows: + +| $V$ | $P(w \vert pos)$ | $P(w \vert neg)$ | $\log \frac{P(w \vert pos)}{P(w \vert neg)}$ | +| ------- | ---------------- | ---------------- | -------------------------------------------- | +| I | 0.16 | 0.16 | 0.0 | +| am | 0.16 | 0.16 | 0.0 | +| happy | 0.16 | 0.05 | 1.163 | +| sad | 0.05 | 0.16 | -1.163 | +| because | 0.11 | 0.11 | 0.0 | +| love | 0.11 | 0.05 | 0.788 | +| hate | 0.05 | 0.11 | -0.788 | +| the | 0.11 | 0.11 | 0.0 | +| weather | 0.11 | 0.11 | 0.0 | + +!!! example + + Let's look at a single example, e.g. the word "happy". The ratio of probabilities is calculated as follows: + + $$ + \log \frac{P(\text{happy}|\text{pos})}{P(\text{happy}|\text{neg})} = \log \frac{0.16}{0.05} = \log 3.2 = 1.163 + $$ + +## Training + +For training of the Naive Bayes classifier for binary classification, we need to do the following: + +1. Calculate the [log prior ratio](#log-prior-ratio) +2. Compute the table of [word frequencies](./feature_extraction.md#positive-and-negative-frequencies) for each class +3. Compute the table of [conditional probabilities](#word-probabilities) of a word given a class using Laplacian Smoothing +4. Compute the [log ratio](#log-ratio-of-probabilities) of the conditional probabilities + +Of course, we need to apply the desired preprocessing steps before the training. + +![Naive Bayes Training](../img/naive-bayes-training.drawio.svg) + +## Prediction + +To predict a tweet using the Naive Bayes classifier for binary classification, we need to apply the likelihood formula to the tweet, and check if the log likelihood is greater than 0. + +$$ +\log \frac{P(pos)}{P(neg)} + \sum_{i=1}^{m} \log \frac{P(w_i|pos)}{P(w_i|neg)} > 0 +$$ + +So for every word in the tweet, we look up the log ratio of probabilities in our likelihood table and sum them up. Then we add the log prior ratio to the sum. + +Words that **do not appear** in the vocabulary are **ignored**. They are considered neutral and do not contribute to the log likelihood, as the model can only give a score for words that it has seen in the training data. + +![Naive Bayes Prediction](../img/naive-bayes-prediction.drawio.svg) + +!!! 
example

    Let's assume we have a **balanced corpus**:

    $$
    \log \frac{P(pos)}{P(neg)} = \log \frac{0.5}{0.5} = \log 1.0 = 0.0
    $$

    Given the table above, let's see if the following tweet is positive or negative:

    > I am happy because I love ice cream

    We have the following **ratios of probabilities**:

    $$
    \begin{aligned}
    \log \frac{P(\text{I}|\text{pos})}{P(\text{I}|\text{neg})} &= \log \frac{0.16}{0.16} = \log 1.0 = 0.0 \\
    \log \frac{P(\text{am}|\text{pos})}{P(\text{am}|\text{neg})} &= \log \frac{0.16}{0.16} = \log 1.0 = 0.0 \\
    \log \frac{P(\text{happy}|\text{pos})}{P(\text{happy}|\text{neg})} &= \log \frac{0.16}{0.05} = \log 3.2 = 1.163 \\
    \log \frac{P(\text{because}|\text{pos})}{P(\text{because}|\text{neg})} &= \log \frac{0.11}{0.11} = \log 1.0 = 0.0 \\
    \log \frac{P(\text{I}|\text{pos})}{P(\text{I}|\text{neg})} &= \log \frac{0.16}{0.16} = \log 1.0 = 0.0 \\
    \log \frac{P(\text{love}|\text{pos})}{P(\text{love}|\text{neg})} &= \log \frac{0.11}{0.05} = \log 2.2 = 0.788 \\
    \end{aligned}
    $$

    Note that the words "ice" and "cream" are not in the vocabulary, so we ignore them.

    Given these ratios, and considering the **log prior**, we can calculate the **log likelihood** of the tweet being positive as follows:

    $$
    \begin{aligned}
    \log \frac{P(pos)}{P(neg)} + \sum_{i=1}^{m} \log \frac{P(w_i|pos)}{P(w_i|neg)} &= 0.0 + 0.0 + 0.0 + 1.163 + 0.0 + 0.0 + 0.788 \\
    &= 1.163 + 0.788 \\
    &= 1.951
    \end{aligned}
    $$

    Since $1.951 > 0$, the tweet is classified as **positive**.

    Note how only the words "happy" and "love" contribute to the log likelihood, since the other words are neutral.

## Limitations

Naive Bayes is a very simple but powerful classifier and doesn't require tuning any hyperparameters.

However, it has some limitations, with the most important one being the **independence assumption**.

The independence assumption in Naive Bayes refers to the assumption that the presence or absence of a particular feature is independent of the presence or absence of any other feature, given the class label.

In other words, Naive Bayes assumes that the features are independent of each other, which typically isn't the case in NLP.

Some words are more likely to appear together than others and are thus not independent. Words can also be related to the thing they describe.

!!! example

    > It is sunny and hot in the Sahara desert.

    - the word "sunny" is more likely to appear with the word "hot" than with the word "cold"
    - the word "Sahara" is more likely to appear with the word "desert" than with the word "ocean"
    - the words "sunny" and "hot" are related to the word "desert"

!!! example

    Which word fills in the blank?

    > It is always cold and snowy in ...

    For Naive Bayes, the words "spring", "summer", "autumn" and "winter" are all equally likely, but from the context, we know that "winter" is the most obvious candidate.

## Key Takeaways

- Naive Bayes is a **simple but powerful** classifier that doesn't require tuning any hyperparameters.
- Naive Bayes is based on Bayes Rule, which is a way to calculate the **conditional probability** $P(A|B)$, given that we know $P(B|A)$.
- By using **logarithms**, we can avoid numerical underflow and simplify the calculations.
- For **training** a Naive Bayes classifier, we need to obtain the [log ratio of probabilities](#log-ratio-of-probabilities) for each word in the vocabulary. 
+- For **prediction**, we need to use those ratios to calculate the [log likelihood](#log-likelihood) of a tweet being positive or negative. +- The main limitation of Naive Bayes is the **independence assumption**, which assumes that the features are independent of each other, which typically isn't the case in NLP. +- However, because of its simplicity, Naive Bayes is often used as a **baseline** for text classification tasks and can perform surprisingly well. From 0886ee2ccad155d8b713341590bce685a7e4df95 Mon Sep 17 00:00:00 2001 From: Pascal Keilbach Date: Sun, 3 Dec 2023 11:42:52 +0100 Subject: [PATCH 3/6] add assignment for naive bayes --- notebooks/naive_bayes.ipynb | 782 ++++++++++++++++++++++++++++++ src/htwgnlp/naive_bayes.py | 174 +++++++ tests/htwgnlp/test_naive_bayes.py | 254 ++++++++++ 3 files changed, 1210 insertions(+) create mode 100644 notebooks/naive_bayes.ipynb create mode 100644 src/htwgnlp/naive_bayes.py create mode 100644 tests/htwgnlp/test_naive_bayes.py diff --git a/notebooks/naive_bayes.ipynb b/notebooks/naive_bayes.ipynb new file mode 100644 index 0000000..469c70b --- /dev/null +++ b/notebooks/naive_bayes.ipynb @@ -0,0 +1,782 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Sentiment Analysis with Naive Bayes\n", + "\n", + "In this notebook, we will explore the use of Naive Bayes for sentiment analysis.\n", + "\n", + "It is essentially the same task as in the previous assignment, except we use Naive Bayes this time." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset\n", + "\n", + "We will use the same dataset as in the previous assignment, that is the NLTK tweets dataset.\n", + "\n", + "Also we will do the same train/test split as in the previous assignment." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of tweets: 10000\n", + "Number of positive tweets: 5000\n", + "Number of negative tweets: 5000\n", + "Number of training samples: 8000\n", + "Number of test samples: 2000\n", + "train_data_pos: 4000\n", + "train_data_neg: 4000\n", + "test_data_pos: 1000\n", + "test_data_neg: 1000\n", + "train_data: 8000\n", + "test_data: 2000\n" + ] + } + ], + "source": [ + "from nltk.corpus import twitter_samples\n", + "\n", + "postive_tweets = twitter_samples.strings(\"positive_tweets.json\")\n", + "negative_tweets = twitter_samples.strings(\"negative_tweets.json\")\n", + "n_samples = len(postive_tweets) + len(negative_tweets)\n", + "n_pos = len(postive_tweets)\n", + "n_neg = len(negative_tweets)\n", + "\n", + "print(\"Total number of tweets: \", n_samples)\n", + "print(\"Number of positive tweets: \", n_pos)\n", + "print(\"Number of negative tweets: \", n_neg)\n", + "\n", + "n_train = int(n_samples * 0.8)\n", + "n_test = n_samples - n_train\n", + "\n", + "print(\"Number of training samples: \", n_train)\n", + "print(\"Number of test samples: \", n_test)\n", + "\n", + "n = int(n_train / 2)\n", + "\n", + "# training data\n", + "train_data_pos = postive_tweets[:n]\n", + "train_data_neg = negative_tweets[:n]\n", + "print(f\"train_data_pos: {len(train_data_pos)}\")\n", + "print(f\"train_data_neg: {len(train_data_neg)}\")\n", + "\n", + "# test data\n", + "test_data_pos = postive_tweets[n:]\n", + "test_data_neg = negative_tweets[n:]\n", + "print(f\"test_data_pos: {len(test_data_pos)}\")\n", + "print(f\"test_data_neg: {len(test_data_neg)}\")\n", + "\n", + "# build train and test datasets\n", + "train_data = train_data_pos + train_data_neg\n", + "test_data = test_data_pos + test_data_neg\n", + "print(f\"train_data: {len(train_data)}\")\n", + "print(f\"test_data: {len(test_data)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "y_train shape: (8000, 1)\n", + "y_test shape: (2000, 1)\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "# create labels\n", + "y_train = np.append(\n", + " np.ones((len(train_data_pos), 1)), np.zeros((len(train_data_neg), 1)), axis=0\n", + ")\n", + "y_test = np.append(\n", + " np.ones((len(test_data_pos), 1)), np.zeros((len(test_data_neg), 1)), axis=0\n", + ")\n", + "\n", + "print(\"y_train shape: \", y_train.shape)\n", + "print(\"y_test shape: \", y_test.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocessing\n", + "\n", + "We will reuse our preprocessing pipeline." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from htwgnlp.preprocessing_private import TweetProcessor\n", + "\n", + "processor = TweetProcessor()\n", + "train_data_processed = [processor.process_tweet(tweet) for tweet in train_data]\n", + "train_data_processed[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training\n", + "\n", + "For training, the goal is to find the word probabilities for each class.\n", + "\n", + "Also we need the log ratio of the probabilities, which are calculated from the word probabilities." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01
hopeless0.0000830.000027
tmr0.0001100.000054
:(0.1012560.000054
everyth0.0004130.000300
kid0.0004680.000381
.........
umair0.0000280.000054
thoracicbridg0.0000280.000054
5minut0.0000280.000054
nonscript0.0000280.000054
soph0.0000280.000054
\n", + "

9160 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " 0 1\n", + "hopeless 0.000083 0.000027\n", + "tmr 0.000110 0.000054\n", + ":( 0.101256 0.000054\n", + "everyth 0.000413 0.000300\n", + "kid 0.000468 0.000381\n", + "... ... ...\n", + "umair 0.000028 0.000054\n", + "thoracicbridg 0.000028 0.000054\n", + "5minut 0.000028 0.000054\n", + "nonscript 0.000028 0.000054\n", + "soph 0.000028 0.000054\n", + "\n", + "[9160 rows x 2 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from htwgnlp.naive_bayes_private import NaiveBayes\n", + "\n", + "model = NaiveBayes()\n", + "\n", + "model.fit(train_data_processed, y_train)\n", + "model.word_probabilities" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "hopeless -1.109570\n", + "tmr -0.704105\n", + ":( -7.527391\n", + "everyth -0.321113\n", + "kid -0.205114\n", + " ... \n", + "umair 0.682189\n", + "thoracicbridg 0.682189\n", + "5minut 0.682189\n", + "nonscript 0.682189\n", + "soph 0.682189\n", + "Length: 9160, dtype: float64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.log_ratios" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Testing\n", + "\n", + "For testing, we need to make sure to apply the same preprocessing pipeline as for training.\n", + "\n", + "Then we can calculate the log ratio of the probabilities for each class.\n", + "\n", + "This is done by the `predict` function, which returns the predicted class label." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "test_data_processed = [processor.process_tweet(tweet) for tweet in test_data]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[1],\n", + " [1],\n", + " [1],\n", + " ...,\n", + " [0],\n", + " [0],\n", + " [0]])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_pred = model.predict(test_data_processed)\n", + "y_pred" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation\n", + "\n", + "We can observe that we achieve a relatively high accuracy of 99.65% on the test set.\n", + "\n", + "```\n", + "# expected output\n", + "Accuracy: 0.9965\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 99.65\n" + ] + } + ], + "source": [ + "print(f\"Accuracy: {(y_pred == y_test).mean() * 100}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can try to predict our own tweet." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tweet: ['konstanz', 'great', 'place', 'live']\n", + "prediction: [1]\n" + ] + } + ], + "source": [ + "tweet = \"Konstanz is a great place to live!\"\n", + "x_i = [processor.process_tweet(tweet)]\n", + "print(f\"tweet: {x_i[0]}\")\n", + "print(f\"prediction: {model.predict(x_i)[0]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Error Analysis\n", + "\n", + "Finally, we can check the error cases to see where our model fails." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sample: 65, predicted class: [0], actual class: [1.] log likelihood: -1.4684, tweet: @jaredNOTsubway @iluvmariah @Bravotv Then that truly is a LATERAL move! Now, we all know the Queen Bee is UPWARD BOUND : ) #MovingOnUp\n", + "sample: 222, predicted class: [0], actual class: [1.] log likelihood: -1.0290, tweet: A new report talks about how we burn more calories in the cold, because we work harder to warm up. Feel any better about the weather? :p\n", + "sample: 753, predicted class: [0], actual class: [1.] log likelihood: -0.9607, tweet: off to the park to get some sunlight : )\n", + "sample: 822, predicted class: [0], actual class: [1.] log likelihood: -0.4665, tweet: @msarosh Uff Itna Miss karhy thy ap :p\n", + "sample: 1057, predicted class: [1], actual class: [0.] log likelihood: 0.7028, tweet: @rcdlccom hello, any info about possible interest in Jonathas ?? He is close to join Betis :( greatings\n", + "sample: 1298, predicted class: [1], actual class: [0.] log likelihood: 1.9149, tweet: @phenomyoutube u probs had more fun with david than me : (\n", + "sample: 1544, predicted class: [1], actual class: [0.] log likelihood: 1.3753, tweet: pats jay : (\n" + ] + } + ], + "source": [ + "error_cases = np.nonzero((y_pred.flatten() != y_test.flatten()))[0]\n", + "y_prob = model.predict_prob(test_data_processed)\n", + "\n", + "for i in error_cases:\n", + " print(\n", + " f\"sample: {i:>4}, predicted class: {y_pred[i]}, actual class: {y_test[i]} log likelihood: {y_prob[i].item():7.4f}, tweet: {test_data[i]}\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To better understand our classifier, we can check which words have the most impact on the sentiment of the review.\n", + "\n", + "We can use the log ratios of the conditional probabilities to find the words that are most indicative of a positive or negative tweet.\n", + "\n", + "Remember from the lecture that a value greater than 0 means that the word is more likely to appear in a positive tweet, and a value less than 0 means that the word is more likely to appear in a negative tweet." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + ":) 6.883712\n", + ":-) 6.304400\n", + ":d 6.250534\n", + ":p 4.652481\n", + "stat 3.940286\n", + "bam 3.795705\n", + "warsaw 3.795705\n", + "blog 3.321247\n", + "fback 3.284879\n", + "followfriday 3.167096\n", + "dtype: float64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.log_ratios.sort_values(ascending=False).head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looking at the counts may give us a better intuition." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01ratio
:)22960987.0
:-)0552553.0
:d0523524.0
:p0105106.0
stat05152.0
warsaw04445.0
bam04445.0
blog02728.0
fback02627.0
followfriday02324.0
\n", + "
" + ], + "text/plain": [ + " 0 1 ratio\n", + ":) 2 2960 987.0\n", + ":-) 0 552 553.0\n", + ":d 0 523 524.0\n", + ":p 0 105 106.0\n", + "stat 0 51 52.0\n", + "warsaw 0 44 45.0\n", + "bam 0 44 45.0\n", + "blog 0 27 28.0\n", + "fback 0 26 27.0\n", + "followfriday 0 23 24.0" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = model.df_freqs.copy()\n", + "\n", + "df[\"ratio\"] = (df[1] + 1) / (df[0] + 1)\n", + "df.sort_values(by=\"ratio\", ascending=False).head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01ratio
:(367510.000544
:-(38600.002584
21000.004739
21000.004739
>:(4300.022727
justi̇n3500.027778
wi̇ll3500.027778
beli̇ev3500.027778
see3500.027778
me3500.027778
\n", + "
" + ], + "text/plain": [ + " 0 1 ratio\n", + ":( 3675 1 0.000544\n", + ":-( 386 0 0.002584\n", + "》 210 0 0.004739\n", + "♛ 210 0 0.004739\n", + ">:( 43 0 0.022727\n", + "justi̇n 35 0 0.027778\n", + "wi̇ll 35 0 0.027778\n", + "beli̇ev 35 0 0.027778\n", + "see 35 0 0.027778\n", + "me 35 0 0.027778" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.sort_values(by=\"ratio\").head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "The Naive Bayes classifier is a simple but powerful classifier that works well on text classification problems. \n", + "\n", + "It makes the assumption that the features are conditionally independent given the class, which is not true in general, but it still performs well in practice.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/htwgnlp/naive_bayes.py b/src/htwgnlp/naive_bayes.py new file mode 100644 index 0000000..42e778f --- /dev/null +++ b/src/htwgnlp/naive_bayes.py @@ -0,0 +1,174 @@ +"""Naive Bayes classifier for NLP. + +This module contains the NaiveBayes class for NLP tasks. + +Implementing this module is the 3rd assignment of the course. You can find your tasks by searching for `TODO ASSIGNMENT-3` comments. + +Hints: +- Find more information about the Python property decorator [here](https://www.programiz.com/python-programming/property) +- To build the word frequencies, you can use the [Counter](https://docs.python.org/3/library/collections.html#collections.Counter) class from Python's collections module +- you may also find the Python [zip](https://docs.python.org/3/library/functions.html#zip) function useful. +- for prediction, you may find the [intersection](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Index.intersection.html) method of the pandas Index class useful. + +""" + + +from collections import Counter + +import numpy as np +import pandas as pd + + +class NaiveBayes: + """Naive Bayes classifier for NLP tasks. + + This class implements a Naive Bayes classifier for NLP tasks. + It can be used for binary classification tasks. + + Attributes: + word_probabilities (pd.DataFrame): the word probabilities per class, None before training + df_freqs (pd.DataFrame): the word frequencies per class, None before training + log_ratios (pd.Series): the log ratios of the word probabilities, None before training + logprior (float): the logprior of the model, 0 before training + alpha (float): the smoothing parameter of the model + """ + + def __init__(self, alpha: float = 1.0) -> None: + """Initializes the NaiveBayes class. + + The init method accepts one hyperparameter as an optional argument, the smoothing parameter alpha. + + Args: + alpha (float, optional): the smoothing parameter. Defaults to 1.0. + """ + # TODO ASSIGNMENT-3: implement this method + raise NotImplementedError("This method needs to be implemented.") + + @property + def logprior(self) -> float: + """Returns the logprior. 
+ + Returns: + float: the logprior + """ + # TODO ASSIGNMENT-3: implement this method + raise NotImplementedError("This method needs to be implemented.") + + @logprior.setter + def logprior(self, y: np.ndarray) -> None: + """Sets the logprior. + + Note that `y` must contain both classes. + + Args: + y (np.ndarray): a numpy array of class labels of shape (m, 1), where m is the number of samples + """ + # TODO ASSIGNMENT-3: implement this method + raise NotImplementedError("This method needs to be implemented.") + + def _get_word_frequencies(self, X: list[list[str]], y: np.ndarray) -> None: + """Computes the word frequencies per class. + + For a given list of tokenized text and a numpy array of class labels, the method computes the word frequencies for each class and stores them as a pandas DataFrame in the `df_freqs` attribute. + + In pandas, if a word does not occur in a class, the frequency should be set to 0, and not to NaN. Also make sure that the frequencies are of type int. + + Note that the this implementation of Naive Bayes is designed for binary classification. + + Args: + X (list[list[str]]): a list of tokenized text samples of length m, where m is the number of samples. + y (np.ndarray): a numpy array of class labels of shape (m, 1), where m is the number of samples. + """ + # TODO ASSIGNMENT-3: implement this method + raise NotImplementedError("This method needs to be implemented.") + + def _get_word_probabilities(self) -> None: + """Computes the conditional probabilities of a word given a class using Laplacian Smoothing. + + Based on the word frequencies, the method computes the conditional probabilities for a word given its class and stores them in the `word_probabilities` attribute. + """ + # TODO ASSIGNMENT-3: implement this method + raise NotImplementedError("This method needs to be implemented.") + + def _get_log_ratios(self) -> None: + """Computes the log ratio of the conditional probabilities. + + Based on the word probabilities, the method computes the log ratios and stores them in the `log_ratios` attribute. + """ + # TODO ASSIGNMENT-3: implement this method + raise NotImplementedError("This method needs to be implemented.") + + def fit(self, X: list[list[str]], y: np.ndarray) -> None: + """Fits a Naive Bayes model for the given text samples and labels. + + Before training naive bayes, a couple of assertions are performed to check the validity of the input data: + - The number of text samples and labels must be equal. + - y must be a 2-dimensional array. + - y must be a column vector. + + if all assertions pass, the method calls the Naive Bayes training method is executed. + + Args: + X (list[list[str]]): a list of tokenized text samples of length m, where m is the number of samples + y (np.ndarray): a numpy array of class labels of shape (m, 1), where m is the number of samples + """ + # TODO ASSIGNMENT-3: implement this method + raise NotImplementedError("This method needs to be implemented.") + + def _train_naive_bayes(self, X: list[list[str]], y: np.ndarray) -> None: + """Trains a Naive Bayes model for the given text samples and labels. 
+ + Training is done in four steps: + - Compute the log prior ratio + - Compute the word frequencies + - Compute the word probabilities of a word given a class using Laplacian Smoothing + - Compute the log ratios + + Args: + X (list[list[str]]): a list of tokenized text samples of length m, where m is the number of samples + y (np.ndarray): a numpy array of class labels of shape (m, 1), where m is the number of samples + """ + # TODO ASSIGNMENT-3: implement this method + raise NotImplementedError("This method needs to be implemented.") + + def predict(self, X: list[list[str]]) -> np.ndarray: + """Predicts the class labels for the given text samples. + + The class labels are returned as a column vector, where each entry represents the class label of the corresponding sample. + + Args: + X (list[list[str]]): a list of tokenized text samples of length m, where m is the number of samples + + Returns: + np.ndarray: a numpy array of class labels of shape (m, 1), where m is the number of samples + """ + # TODO ASSIGNMENT-3: implement this method + raise NotImplementedError("This method needs to be implemented.") + + def predict_prob(self, X: list[list[str]]) -> np.ndarray: + """Calculates the log likelihoods for the given text samples. + + The class probabilities are returned as a column vector, where each entry represents the probability of the corresponding sample. + + Args: + X (list[list[str]]): a list of tokenized text samples of length m, where m is the number of samples + + Returns: + np.ndarray: a numpy array of class probabilities of shape (m, 1), where m is the number of samples + """ + # TODO ASSIGNMENT-3: implement this method + raise NotImplementedError("This method needs to be implemented.") + + def predict_single(self, x: list[str]) -> float: + """Calculates the log likelihood for a single text sample. + + Words that are not in the vocabulary are ignored. 
+ + Args: + x (list[str]): a tokenized text sample + + Returns: + float: the log likelihood of the text sample + """ + # TODO ASSIGNMENT-3: implement this method + raise NotImplementedError("This method needs to be implemented.") diff --git a/tests/htwgnlp/test_naive_bayes.py b/tests/htwgnlp/test_naive_bayes.py new file mode 100644 index 0000000..b7057ad --- /dev/null +++ b/tests/htwgnlp/test_naive_bayes.py @@ -0,0 +1,254 @@ +from contextlib import nullcontext as does_not_raise + +import numpy as np +import pandas as pd +import pytest + +from htwgnlp.naive_bayes import NaiveBayes + +model = NaiveBayes() + +train_data = { + "samples": [ + ["I", "am", "happy"], + ["I", "am", "sad"], + ["I", "am", "happy", "because", "I", "love", "the", "weather"], + ["I", "am", "sad", "because", "I", "hate", "the", "weather"], + ], + "labels": np.array([1, 0, 1, 0]).reshape(-1, 1), +} + +# test data contains a list of samples, the corresponding expected labels, and expected values for log likelihoods +# type: ignore +test_data_dict: dict[str, list] = { + "samples": [ + ["foo", "bar", "love"], + ["foo", "bar", "hate"], + ["foo", "bar", "baz"], + ["happy", "love", "am"], + ["sad", "hate", "am"], + ["happy", "love", "sad"], + ["happy", "hate", "sad"], + ["the", "am", "weather"], + ], + "labels": [1, 0, 0, 1, 0, 1, 0, 0], + "expected": [ + np.log(0.1 / 0.05), + np.log(0.05 / 0.1), + 0.0, + (np.log(0.15 / 0.05) + np.log(0.1 / 0.05) + 0.0), + (np.log(0.05 / 0.15) + np.log(0.05 / 0.1) + 0.0), + (np.log(0.15 / 0.05) + np.log(0.1 / 0.05) + np.log(0.05 / 0.15)), + (np.log(0.15 / 0.05) + np.log(0.05 / 0.1) + np.log(0.05 / 0.15)), + 0.0, + ], +} + + +@pytest.fixture +def train_samples(): + return train_data["samples"] + + +@pytest.fixture +def train_samples_labels(): + return train_data["labels"] + + +@pytest.fixture +def test_samples(): + return test_data_dict["samples"] + + +@pytest.fixture +def test_samples_labels(): + return test_data_dict["labels"] + + +@pytest.fixture +def test_samples_expected(): + return test_data_dict["expected"] + + +@pytest.fixture +def trained_model(train_samples, train_samples_labels): + model._train_naive_bayes(train_samples, train_samples_labels) + return model + + +@pytest.fixture +def trained_frequencies(train_samples, train_samples_labels): + model._get_word_frequencies(train_samples, train_samples_labels) + return model.df_freqs + + +@pytest.fixture +def trained_likelihoods(train_samples, train_samples_labels): + model._get_word_frequencies(train_samples, train_samples_labels) + model._get_word_probabilities() + return model.word_probabilities + + +@pytest.fixture +def trained_log_ratios(train_samples, train_samples_labels): + model._get_word_frequencies(train_samples, train_samples_labels) + model._get_word_probabilities() + model._get_log_ratios() + return model.log_ratios + + +@pytest.mark.parametrize( + "labels, expected", + [ + (np.array([1, 1, 1, 1, 0, 0, 0, 0]), 0.0), + (np.array([1, 1, 1, 1, 1, 1, 0, 0]), np.log(6) - np.log(2)), + (np.array([0, 0, 0, 0, 0, 0, 1, 1]), np.log(2) - np.log(6)), + ], +) +def test_set_logprior(labels, expected): + model.logprior = labels + assert model.logprior == expected + + +@pytest.mark.parametrize( + "labels, expectation", + [ + (np.array([0, 0, 0, 0, 0, 0, 0, 0]), pytest.raises(AssertionError)), + (np.array([1, 1, 1, 1, 1, 1, 1, 1]), pytest.raises(AssertionError)), + (np.array([1, 1, 1, 1, 0, 0, 0, 0]), does_not_raise()), + ], +) +def test_set_logprior_exception(labels, expectation): + with expectation: + model.logprior = labels + + +def 
test_get_word_frequencies(trained_frequencies): + assert trained_frequencies.index.size == 9 + assert trained_frequencies.loc["happy", 1] == 2 + assert trained_frequencies.loc["happy", 0] == 0 + assert trained_frequencies.loc["sad", 1] == 0 + assert trained_frequencies.loc["sad", 0] == 2 + assert trained_frequencies.loc["weather", 1] == 1 + assert trained_frequencies.loc["weather", 0] == 1 + assert trained_frequencies.loc["love", 1] == 1 + assert trained_frequencies.loc["love", 0] == 0 + assert trained_frequencies.loc["hate", 1] == 0 + assert trained_frequencies.loc["hate", 0] == 1 + assert trained_frequencies.loc["I", 1] == 3 + assert trained_frequencies.loc["I", 0] == 3 + assert trained_frequencies.loc["am", 1] == 2 + assert trained_frequencies.loc["am", 0] == 2 + assert trained_frequencies.loc["because", 1] == 1 + assert trained_frequencies.loc["because", 0] == 1 + assert trained_frequencies.loc["the", 1] == 1 + assert trained_frequencies.loc["the", 0] == 1 + + +def test_get_word_probabilities(trained_likelihoods): + assert trained_likelihoods.index.size == 9 + assert trained_likelihoods.loc["happy", 1] == 0.15 + assert trained_likelihoods.loc["happy", 0] == 0.05 + assert trained_likelihoods.loc["sad", 1] == 0.05 + assert trained_likelihoods.loc["sad", 0] == 0.15 + assert trained_likelihoods.loc["weather", 1] == 0.1 + assert trained_likelihoods.loc["weather", 0] == 0.1 + assert trained_likelihoods.loc["love", 1] == 0.1 + assert trained_likelihoods.loc["love", 0] == 0.05 + assert trained_likelihoods.loc["hate", 1] == 0.05 + assert trained_likelihoods.loc["hate", 0] == 0.1 + assert trained_likelihoods.loc["I", 1] == 0.2 + assert trained_likelihoods.loc["I", 0] == 0.2 + assert trained_likelihoods.loc["am", 1] == 0.15 + assert trained_likelihoods.loc["am", 0] == 0.15 + assert trained_likelihoods.loc["because", 1] == 0.1 + assert trained_likelihoods.loc["because", 0] == 0.1 + assert trained_likelihoods.loc["the", 1] == 0.1 + assert trained_likelihoods.loc["the", 0] == 0.1 + + +def test_get_log_ratios(trained_log_ratios): + assert isinstance(trained_log_ratios, pd.Series) + assert trained_log_ratios.index.size == 9 + + assert trained_log_ratios.loc["happy"] == np.log(0.15 / 0.05) + assert trained_log_ratios.loc["sad"] == np.log(0.05 / 0.15) + assert trained_log_ratios.loc["love"] == np.log(0.1 / 0.05) + assert trained_log_ratios.loc["hate"] == np.log(0.05 / 0.1) + assert trained_log_ratios.loc["weather"] == 0.0 + assert trained_log_ratios.loc["I"] == 0.0 + assert trained_log_ratios.loc["am"] == 0.0 + assert trained_log_ratios.loc["because"] == 0.0 + assert trained_log_ratios.loc["the"] == 0.0 + + +@pytest.mark.parametrize( + "X, y, expectation", + [ + ( + [["I", "am", "happy"], ["I", "am", "sad"], ["NLP", "is", "fun"]], + np.array([1, 0, 1, 0]).reshape(-1, 1), + pytest.raises(AssertionError), + ), + ( + [["I", "am", "happy"], ["I", "am", "sad"], ["NLP", "is", "fun"]], + np.array([1, 0, 1]), + pytest.raises(AssertionError), + ), + ( + [["I", "am", "happy"], ["I", "am", "sad"], ["NLP", "is", "fun"]], + np.array([1, 0, 1]).reshape(1, -1), + pytest.raises(AssertionError), + ), + ( + [["I", "am", "happy"], ["I", "am", "sad"], ["NLP", "is", "fun"]], + np.array([1, 0, 1]).reshape(-1, 1), + does_not_raise(), + ), + ], +) +def test_fit(X, y, expectation): + with expectation: + model = NaiveBayes() + model.fit(X, y) + + +def test_train_naive_bayes(trained_model): + assert trained_model.logprior == 0.0 + + assert trained_model.df_freqs.index.size == 9 + assert trained_model.df_freqs.columns.size == 2 + 
assert trained_model.df_freqs.select_dtypes(include=["int64"]).columns.size == 2 + + assert trained_model.word_probabilities.index.size == 9 + assert trained_model.word_probabilities.columns.size == 2 + assert ( + trained_model.word_probabilities.select_dtypes(include=["float64"]).columns.size + == 2 + ) + + +@pytest.mark.parametrize( + "test_sample, expected", + [pair for pair in zip(test_data_dict["samples"], test_data_dict["expected"])], +) +def test_predict_single(trained_model, test_sample, expected): + y_pred = trained_model.predict_single(test_sample) + + np.testing.assert_allclose(y_pred, expected) + + +def test_predict_prob(trained_model, test_samples, test_samples_expected): + y_pred = trained_model.predict_prob(test_samples) + + assert isinstance(y_pred, np.ndarray) + assert y_pred.shape == (8, 1) + np.testing.assert_allclose(y_pred, np.array(test_samples_expected).reshape(-1, 1)) + + +def test_predict(trained_model, test_samples, test_samples_labels): + y_pred = trained_model.predict(test_samples) + + assert isinstance(y_pred, np.ndarray) + assert y_pred.shape == (8, 1) + np.testing.assert_array_equal(y_pred, np.array(test_samples_labels).reshape(-1, 1)) From 8d9d15d43e262f308c608e4b984b469909543b7f Mon Sep 17 00:00:00 2001 From: Pascal Keilbach Date: Sun, 3 Dec 2023 11:43:15 +0100 Subject: [PATCH 4/6] add assignment for naive bayes --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index bd69c97..e303898 100644 --- a/Makefile +++ b/Makefile @@ -30,3 +30,6 @@ assignment_1: assignment_2: .venv/bin/pytest tests/htwgnlp/test_features.py .venv/bin/pytest tests/htwgnlp/test_logistic_regression.py + +assignment_3: + .venv/bin/pytest tests/htwgnlp/test_naive_bayes.py \ No newline at end of file From b0a2f11a6ac11f917e1781af8771c62da000c2bd Mon Sep 17 00:00:00 2001 From: Pascal Keilbach Date: Sun, 3 Dec 2023 11:43:38 +0100 Subject: [PATCH 5/6] add assignment for naive bayes --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e303898..a5d76ba 100644 --- a/Makefile +++ b/Makefile @@ -32,4 +32,4 @@ assignment_2: .venv/bin/pytest tests/htwgnlp/test_logistic_regression.py assignment_3: - .venv/bin/pytest tests/htwgnlp/test_naive_bayes.py \ No newline at end of file + .venv/bin/pytest tests/htwgnlp/test_naive_bayes.py From cfce3e1a020ea1b7fa98d930ffd6c9358bb232cb Mon Sep 17 00:00:00 2001 From: Pascal Keilbach Date: Sun, 3 Dec 2023 12:09:45 +0100 Subject: [PATCH 6/6] rm figure --- ...bayes-conditional-probability-1.drawio.svg | 76 ------------------- 1 file changed, 76 deletions(-) delete mode 100644 docs/img/naive-bayes-conditional-probability-1.drawio.svg diff --git a/docs/img/naive-bayes-conditional-probability-1.drawio.svg b/docs/img/naive-bayes-conditional-probability-1.drawio.svg deleted file mode 100644 index 2f1ff2e..0000000 --- a/docs/img/naive-bayes-conditional-probability-1.drawio.svg +++ /dev/null @@ -1,76 +0,0 @@ - - - - - - - - - - - -
-
-
- "amazing" -
-
-
-
- - "amazing" - -
-
- - - - - - Corpus - - - - - -
-
-
- "amazing" - - - - - - - - - - - - - - - positive -
-
-
-
- - "amazing" \[\... - -
-
- -
- - - - - Text is not SVG - cannot display - - - -
\ No newline at end of file