From 0f29027433fd6d334535b002f42744a15f9afb0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Fri, 1 Nov 2024 10:29:15 +0000 Subject: [PATCH 01/10] Bump dev version to `0.13.0.dev0` --- setup.py | 2 +- trl/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 781b036cfe..89a419535d 100644 --- a/setup.py +++ b/setup.py @@ -73,7 +73,7 @@ from setuptools import find_packages, setup -__version__ = "0.12.0.dev0" # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) +__version__ = "0.13.0.dev0" # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) REQUIRED_PKGS = [ "accelerate>=0.34.0", diff --git a/trl/__init__.py b/trl/__init__.py index 28751f56b1..5859a5c91d 100644 --- a/trl/__init__.py +++ b/trl/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.12.0.dev0" +__version__ = "0.13.0.dev0" from typing import TYPE_CHECKING From 54b106dd33c6f9b89a6e17271b0f27ce89c5d7d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Fri, 1 Nov 2024 10:31:55 +0000 Subject: [PATCH 02/10] Update version number to 0.12 in CITATION.cff --- CITATION.cff | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CITATION.cff b/CITATION.cff index 6ea064ec9d..06b7d105d6 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -31,4 +31,4 @@ keywords: - pytorch - transformers license: Apache-2.0 -version: 0.11.1 +version: 0.12 From fc1cac133fb648f3f1161868a642c88dcaa5c1bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 4 Nov 2024 11:41:38 +0000 Subject: [PATCH 03/10] Add publication date to blog post --- docs/source/index.mdx | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index b1de84afb1..81c271cdfd 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -39,6 +39,7 @@ Check the appropriate sections of the documentation depending on your needs:
[hunk body lost to HTML stripping; surviving text: the card for "Preference Optimization for Vision Language Models with TRL" (thumbnail alt text and title), with one added line: "Published on July 10, 2024"]
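The version bump in PATCH 01 is constrained by the setup.py comment quoted above: "expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)". A quick sketch of that convention as an executable check; the regex and the surrounding script are illustrative, not part of TRL's release tooling:

```python
import re

# Accepts x.y.z, x.y.z.dev0 and x.y.z.rcN; rejects dashed forms such as x.y.z-dev0.
VERSION_RE = re.compile(r"^\d+\.\d+\.\d+(\.dev0|\.rc\d+)?$")

for candidate in ["0.13.0.dev0", "0.12.0.rc1", "0.12.0", "0.12.0-dev0"]:
    print(candidate, "ok" if VERSION_RE.match(candidate) else "rejected")
```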
From 337005d95169371935fb87f1c559c7412f8472a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:27:37 +0100 Subject: [PATCH 04/10] =?UTF-8?q?=F0=9F=A7=BD=20Fix=20judge=20documentatio?= =?UTF-8?q?n=20(#2318)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update judge examples and documentation * without ':' * Clean doc * Fix typo in example code * Add space after Attributes * Update attribute name in judges.py * Add installation instructions for llm-blender library * Update PairRMJudge attributes documentation * Fix return type in PairRMJudge --- trl/trainer/judges.py | 132 +++++++++++++++++++++++++----------------- 1 file changed, 79 insertions(+), 53 deletions(-) diff --git a/trl/trainer/judges.py b/trl/trainer/judges.py index e0638565c9..af56ec3d9b 100644 --- a/trl/trainer/judges.py +++ b/trl/trainer/judges.py @@ -76,7 +76,7 @@ class BaseRankJudge(ABC): """ Base class for LLM ranking judges. - Example: + **Example**: ```python class MyRankJudge(BaseRankJudge): def judge(self, prompts, completions, shuffle_order=True): @@ -96,13 +96,18 @@ def judge(self, prompts: List[str], completions: List[List[str]], shuffle_order: Judge the completion for the given prompts and return the ranks of each completion. Args: - prompts (`List[str]`): List of prompts. - completions (`List[List[str]]`): List of completions list, where each element is a list of completions for the corresponding prompt. - shuffle_order (`bool`): Whether to shuffle the order of the completions to avoid positional bias. + prompts (`List[str]`): + List of prompts. + completions (`List[List[str]]`): + List of completions list, where each element is a list of completions for the corresponding prompt. + shuffle_order (`bool`, *optional*, defaults to `True`): + Whether to shuffle the order of the completions to avoid positional bias. Returns: - List of lists of idxs, where each list contains the ranks of the completions for the corresponding prompt. - E.g., [1, 2, 0] means that the second completion (idx=1) is the best, followed by the third, and then the first. + `List[List[int]]`: + List of lists of idxs, where each list contains the ranks of the completions for the corresponding + prompt. E.g., `[1, 2, 0]` means that the second completion (`idx=1`) is the best, followed by the + third, and then the first. """ raise NotImplementedError("Judge subclasses must implement the `judge` method.") @@ -118,18 +123,23 @@ def judge(self, prompts: List[str], completions: List[List[str]], shuffle_order: Judge the completion pairs for the given prompts. Args: - prompts (`List[str]`): List of prompts. - completions (`List[List[str]]`): List of completions pairs, where each element is a pair of completions for the corresponding prompt. - shuffle_order (`bool`): Whether to shuffle the order of the completions to avoid positional bias. + prompts (`List[str]`): + List of prompts. + completions (`List[List[str]]`): + List of completions pairs, where each element is a pair of completions for the corresponding prompt. + shuffle_order (`bool`, *optional*, defaults to `True`): + Whether to shuffle the order of the completions to avoid positional bias. Returns: - List of idxs, where each idx is the rank of the best completion for the corresponding prompt. - E.g., 1 means that the second completion (idx=1) is the best. 
+ `List[int]`: + List of idxs, where each idx is the rank of the best completion for the corresponding prompt. + E.g., `1` means that the second completion (`idx=1`) is the best. Note: - If the judge returns -1 for any prompt, it indicates that the inner process used to compute the preference has failed. - For instance, this could occur if the underlying language model returned an invalid answer. - In such cases, the caller should handle these invalid indices appropriately, possibly by implementing fallback logic or error handling. + If the judge returns `-1` for any prompt, it indicates that the inner process used to compute the + preference has failed. For instance, this could occur if the underlying language model returned an invalid + answer. In such cases, the caller should handle these invalid indices appropriately, possibly by + implementing fallback logic or error handling. """ raise NotImplementedError("Judge subclasses must implement the `judge` method.") @@ -157,30 +167,34 @@ class PairRMJudge(BasePairwiseJudge): """ LLM judge based on the PairRM model from AllenAI. - This judge uses the PairRM model to rank pairs of completions for given prompts. - It's designed for pairwise comparison of language model outputs. - - The PairRM model is loaded using the llm-blender library and runs on the + This judge uses the PairRM model to rank pairs of completions for given prompts. It's designed for pairwise + comparison of language model outputs. The PairRM model is loaded using the llm-blender library and runs on the default Accelerator device. - Attributes: - blender (llm_blender.Blender): An instance of the Blender class from llm-blender. + **Attributes**: + + blender (`llm_blender.Blender`): + An instance of the Blender class from llm-blender. + + **Example**: + ```python + >>> pairrm_judge = PairRMJudge() + >>> prompts = ["Translate 'hello' to French", "What's the capital of Japan?"] + >>> completions = [["Bonjour", "Salut"], ["Kyoto", "Tokyo"]] + >>> results = pairrm_judge.judge(prompts, completions) + >>> print(results) # [0, 1] (indicating the first completion is preferred for the first prompt and the second) + ``` + + - Example: - >>> pairrm_judge = PairRMJudge() - >>> prompts = ["Translate 'hello' to French", "What's the capital of Japan?"] - >>> completions = [["Bonjour", "Salut"], ["Kyoto", "Tokyo"]] - >>> results = pairrm_judge.judge(prompts, completions) - >>> print(results) # [0, 1] (indicating the first completion is preferred for the first prompt and the second) + This class requires the llm-blender library to be installed. Install it with: `pip install llm-blender`. - Note: - This class requires the llm-blender library to be installed. - Install it with: pip install llm-blender + """ def __init__(self): if not is_llm_blender_available(): - raise ValueError("llm-blender is not installed. Please install it with 'pip install llm-blender'.") + raise ValueError("llm-blender is not installed. Please install it with `pip install llm-blender`.") self.blender = llm_blender.Blender() self.blender.loadranker("llm-blender/PairRM", device=Accelerator().device) @@ -196,25 +210,29 @@ def judge( Judge the completion pairs for the given prompts using the PairRM model. Args: - prompts (List[str]): List of prompts to judge. - completions (List[List[str]]): List of completion pairs for each prompt. - shuffle_order (bool, optional): Whether to shuffle the order of completions - to avoid positional bias. Defaults to True. 
- return_scores (bool, optional): If True, return probability scores instead of ranks (i.e. a soft-judge). - Defaults to False. - temperature (float, optional): Temperature for scaling logits if return_scores - is True. Defaults to 1.0. + prompts (`List[str]`): + List of prompts to judge. + completions (`List[List[str]]`): + List of completion pairs for each prompt. + shuffle_order (`bool`, *optional*, defaults to `True`): + Whether to shuffle the order of the completions to avoid positional bias. + return_scores (`bool`, *optional*, defaults to `False`): + If `True`, return probability scores of the first completion instead of ranks (i.e. a *soft-judge*). + temperature (`float`, *optional*, defaults to `1.0`): + Temperature for scaling logits if `return_scores` is True. Returns: - List[Union[int, float]]: List of ranks (0 or 1) or scores for each prompt, - indicating which completion is preferred or its score. + `Union[List[int, float]]`: + If `return_scores` is `False`, returns a list of ranks (`0` or `1`) for each prompt, indicating which + completion is preferred. + If `return_scores` is `True`, returns softmax probabilities for the first completion. Raises: - ValueError: If the number of completions per prompt is not exactly 2. + `ValueError`: + If the number of completions per prompt is not exactly 2. Note: - - Ranks are 0-indexed (0 means the first completion is preferred). - - If return_scores is True, returns softmax probabilities for the first completion. + Unlike llm-blender, ranks are 0-indexed (`0` means the first completion is preferred). """ if len(completions[0]) != 2: @@ -254,11 +272,15 @@ class HfPairwiseJudge(BasePairwiseJudge): This judge is relevant for assessing the quality chat models, where the completion is a response to a given prompt. Args: - model (`str`, *optional*): The model to use for the judge. Defaults to "meta-llama/Meta-Llama-3-70B-Instruct". - token (`str`, *optional*): The Hugging Face API token to use for the InferenceClient. - system_prompt (`str`, *optional*): The system prompt to be used for the judge. If not provided, a default prompt is used. - Note that the system prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. - Also, the inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token response. + model (`str`, *optional*, defaults to `"meta-llama/Meta-Llama-3-70B-Instruct"`): + Model to use for the judge. + token (`str`, *optional*): + Hugging Face API token to use for the [`huggingface_hub.InferenceClient`]. + system_prompt (`str` or `None`, *optional*, defaults to `None`): + The system prompt to be used for the judge. If not provided, a default prompt is used. Note that the system + prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. Also, the + inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token + response. """ def __init__( @@ -306,11 +328,15 @@ class OpenAIPairwiseJudge(BasePairwiseJudge): This judge is relevant for assessing the quality chat models, where the completion is a response to a given prompt. Args: - model (`str`, *optional*): The model to use for the judge. Defaults to `"gpt-4-turbo-preview"`. - system_prompt (`str`, *optional*): The system prompt to be used for the judge. If not provided, a default prompt is used. - Note that the system prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. 
- Also, the inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token response. - max_requests (`int`, *optional*): The maximum number of requests to make to the OpenAI API. Defaults to 1000. If set to `None`, there is no limit. + model (`str`, *optional*, defaults to `"gpt-4-turbo-preview"`): + Model to use for the judge. + system_prompt (`str` or `None`, *optional*, defaults to `None`): + System prompt to be used for the judge. If not provided, a default prompt is used. Note that the system + prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. Also, the + inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token + response. + max_requests (`int` or `None`, *optional*, defaults to `1000`): + Maximum number of requests to make to the OpenAI API. If set to `None`, there is no limit. """ def __init__( From 7c3574c3c08a0171634a0bbd224d65135c2c1fdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 4 Nov 2024 13:29:56 +0000 Subject: [PATCH 05/10] =?UTF-8?q?Revert=20"=F0=9F=A7=BD=20Fix=20judge=20do?= =?UTF-8?q?cumentation=20(#2318)"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 337005d95169371935fb87f1c559c7412f8472a4. --- trl/trainer/judges.py | 132 +++++++++++++++++------------------------- 1 file changed, 53 insertions(+), 79 deletions(-) diff --git a/trl/trainer/judges.py b/trl/trainer/judges.py index af56ec3d9b..e0638565c9 100644 --- a/trl/trainer/judges.py +++ b/trl/trainer/judges.py @@ -76,7 +76,7 @@ class BaseRankJudge(ABC): """ Base class for LLM ranking judges. - **Example**: + Example: ```python class MyRankJudge(BaseRankJudge): def judge(self, prompts, completions, shuffle_order=True): @@ -96,18 +96,13 @@ def judge(self, prompts: List[str], completions: List[List[str]], shuffle_order: Judge the completion for the given prompts and return the ranks of each completion. Args: - prompts (`List[str]`): - List of prompts. - completions (`List[List[str]]`): - List of completions list, where each element is a list of completions for the corresponding prompt. - shuffle_order (`bool`, *optional*, defaults to `True`): - Whether to shuffle the order of the completions to avoid positional bias. + prompts (`List[str]`): List of prompts. + completions (`List[List[str]]`): List of completions list, where each element is a list of completions for the corresponding prompt. + shuffle_order (`bool`): Whether to shuffle the order of the completions to avoid positional bias. Returns: - `List[List[int]]`: - List of lists of idxs, where each list contains the ranks of the completions for the corresponding - prompt. E.g., `[1, 2, 0]` means that the second completion (`idx=1`) is the best, followed by the - third, and then the first. + List of lists of idxs, where each list contains the ranks of the completions for the corresponding prompt. + E.g., [1, 2, 0] means that the second completion (idx=1) is the best, followed by the third, and then the first. """ raise NotImplementedError("Judge subclasses must implement the `judge` method.") @@ -123,23 +118,18 @@ def judge(self, prompts: List[str], completions: List[List[str]], shuffle_order: Judge the completion pairs for the given prompts. Args: - prompts (`List[str]`): - List of prompts. - completions (`List[List[str]]`): - List of completions pairs, where each element is a pair of completions for the corresponding prompt. 
- shuffle_order (`bool`, *optional*, defaults to `True`): - Whether to shuffle the order of the completions to avoid positional bias. + prompts (`List[str]`): List of prompts. + completions (`List[List[str]]`): List of completions pairs, where each element is a pair of completions for the corresponding prompt. + shuffle_order (`bool`): Whether to shuffle the order of the completions to avoid positional bias. Returns: - `List[int]`: - List of idxs, where each idx is the rank of the best completion for the corresponding prompt. - E.g., `1` means that the second completion (`idx=1`) is the best. + List of idxs, where each idx is the rank of the best completion for the corresponding prompt. + E.g., 1 means that the second completion (idx=1) is the best. Note: - If the judge returns `-1` for any prompt, it indicates that the inner process used to compute the - preference has failed. For instance, this could occur if the underlying language model returned an invalid - answer. In such cases, the caller should handle these invalid indices appropriately, possibly by - implementing fallback logic or error handling. + If the judge returns -1 for any prompt, it indicates that the inner process used to compute the preference has failed. + For instance, this could occur if the underlying language model returned an invalid answer. + In such cases, the caller should handle these invalid indices appropriately, possibly by implementing fallback logic or error handling. """ raise NotImplementedError("Judge subclasses must implement the `judge` method.") @@ -167,34 +157,30 @@ class PairRMJudge(BasePairwiseJudge): """ LLM judge based on the PairRM model from AllenAI. - This judge uses the PairRM model to rank pairs of completions for given prompts. It's designed for pairwise - comparison of language model outputs. The PairRM model is loaded using the llm-blender library and runs on the - default Accelerator device. - - **Attributes**: - - blender (`llm_blender.Blender`): - An instance of the Blender class from llm-blender. + This judge uses the PairRM model to rank pairs of completions for given prompts. + It's designed for pairwise comparison of language model outputs. - **Example**: - ```python - >>> pairrm_judge = PairRMJudge() - >>> prompts = ["Translate 'hello' to French", "What's the capital of Japan?"] - >>> completions = [["Bonjour", "Salut"], ["Kyoto", "Tokyo"]] - >>> results = pairrm_judge.judge(prompts, completions) - >>> print(results) # [0, 1] (indicating the first completion is preferred for the first prompt and the second) - ``` + The PairRM model is loaded using the llm-blender library and runs on the + default Accelerator device. - + Attributes: + blender (llm_blender.Blender): An instance of the Blender class from llm-blender. - This class requires the llm-blender library to be installed. Install it with: `pip install llm-blender`. + Example: + >>> pairrm_judge = PairRMJudge() + >>> prompts = ["Translate 'hello' to French", "What's the capital of Japan?"] + >>> completions = [["Bonjour", "Salut"], ["Kyoto", "Tokyo"]] + >>> results = pairrm_judge.judge(prompts, completions) + >>> print(results) # [0, 1] (indicating the first completion is preferred for the first prompt and the second) - + Note: + This class requires the llm-blender library to be installed. + Install it with: pip install llm-blender """ def __init__(self): if not is_llm_blender_available(): - raise ValueError("llm-blender is not installed. 
Please install it with `pip install llm-blender`.") + raise ValueError("llm-blender is not installed. Please install it with 'pip install llm-blender'.") self.blender = llm_blender.Blender() self.blender.loadranker("llm-blender/PairRM", device=Accelerator().device) @@ -210,29 +196,25 @@ def judge( Judge the completion pairs for the given prompts using the PairRM model. Args: - prompts (`List[str]`): - List of prompts to judge. - completions (`List[List[str]]`): - List of completion pairs for each prompt. - shuffle_order (`bool`, *optional*, defaults to `True`): - Whether to shuffle the order of the completions to avoid positional bias. - return_scores (`bool`, *optional*, defaults to `False`): - If `True`, return probability scores of the first completion instead of ranks (i.e. a *soft-judge*). - temperature (`float`, *optional*, defaults to `1.0`): - Temperature for scaling logits if `return_scores` is True. + prompts (List[str]): List of prompts to judge. + completions (List[List[str]]): List of completion pairs for each prompt. + shuffle_order (bool, optional): Whether to shuffle the order of completions + to avoid positional bias. Defaults to True. + return_scores (bool, optional): If True, return probability scores instead of ranks (i.e. a soft-judge). + Defaults to False. + temperature (float, optional): Temperature for scaling logits if return_scores + is True. Defaults to 1.0. Returns: - `Union[List[int, float]]`: - If `return_scores` is `False`, returns a list of ranks (`0` or `1`) for each prompt, indicating which - completion is preferred. - If `return_scores` is `True`, returns softmax probabilities for the first completion. + List[Union[int, float]]: List of ranks (0 or 1) or scores for each prompt, + indicating which completion is preferred or its score. Raises: - `ValueError`: - If the number of completions per prompt is not exactly 2. + ValueError: If the number of completions per prompt is not exactly 2. Note: - Unlike llm-blender, ranks are 0-indexed (`0` means the first completion is preferred). + - Ranks are 0-indexed (0 means the first completion is preferred). + - If return_scores is True, returns softmax probabilities for the first completion. """ if len(completions[0]) != 2: @@ -272,15 +254,11 @@ class HfPairwiseJudge(BasePairwiseJudge): This judge is relevant for assessing the quality chat models, where the completion is a response to a given prompt. Args: - model (`str`, *optional*, defaults to `"meta-llama/Meta-Llama-3-70B-Instruct"`): - Model to use for the judge. - token (`str`, *optional*): - Hugging Face API token to use for the [`huggingface_hub.InferenceClient`]. - system_prompt (`str` or `None`, *optional*, defaults to `None`): - The system prompt to be used for the judge. If not provided, a default prompt is used. Note that the system - prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. Also, the - inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token - response. + model (`str`, *optional*): The model to use for the judge. Defaults to "meta-llama/Meta-Llama-3-70B-Instruct". + token (`str`, *optional*): The Hugging Face API token to use for the InferenceClient. + system_prompt (`str`, *optional*): The system prompt to be used for the judge. If not provided, a default prompt is used. + Note that the system prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. 
+ Also, the inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token response. """ def __init__( @@ -328,15 +306,11 @@ class OpenAIPairwiseJudge(BasePairwiseJudge): This judge is relevant for assessing the quality chat models, where the completion is a response to a given prompt. Args: - model (`str`, *optional*, defaults to `"gpt-4-turbo-preview"`): - Model to use for the judge. - system_prompt (`str` or `None`, *optional*, defaults to `None`): - System prompt to be used for the judge. If not provided, a default prompt is used. Note that the system - prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. Also, the - inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token - response. - max_requests (`int` or `None`, *optional*, defaults to `1000`): - Maximum number of requests to make to the OpenAI API. If set to `None`, there is no limit. + model (`str`, *optional*): The model to use for the judge. Defaults to `"gpt-4-turbo-preview"`. + system_prompt (`str`, *optional*): The system prompt to be used for the judge. If not provided, a default prompt is used. + Note that the system prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. + Also, the inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token response. + max_requests (`int`, *optional*): The maximum number of requests to make to the OpenAI API. Defaults to 1000. If set to `None`, there is no limit. """ def __init__( From 919c9801d2c4133bf84573a19d60a3bdc992fdde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 4 Nov 2024 13:48:34 +0000 Subject: [PATCH 06/10] Update blog post publication dates --- docs/source/index.mdx | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 81c271cdfd..cc9820f4d9 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -37,29 +37,34 @@ Check the appropriate sections of the documentation depending on your needs:
From 00945775bfc4bb8c327e07ace77a46c04c20ae5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 4 Nov 2024 14:04:20 +0000 Subject: [PATCH 07/10] revert to p5 --- docs/source/index.mdx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index cc9820f4d9..fd2508e75f 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -37,27 +37,27 @@ Check the appropriate sections of the documentation depending on your needs:
[hunk body lost to HTML stripping; surviving text: one removed/re-added line pair in each of five blog cards: "Preference Optimization for Vision Language Models with TRL" (July 10, 2024), "Illustrating Reinforcement Learning from Human Feedback" (December 9, 2022), "Fine-tuning 20B LLMs with RLHF on a 24GB consumer GPU" (March 9, 2023), "StackLLaMA: A hands-on guide to train LLaMA with RLHF" (April 5, 2023), and "Fine-tune Llama 2 with DPO" (August 8, 2023)]
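Both the rewritten docstrings in PATCH 04 and the restored originals in PATCH 05 describe the same pairwise-judge contract: `judge(prompts, completions, shuffle_order=True)` returns one index per prompt, `0` or `1` for the preferred completion, with `-1` reserved for a failed judgment. A minimal sketch of a custom judge written against that contract; the `LongerIsBetterJudge` class and its length heuristic are invented for illustration and are not part of TRL:

```python
from typing import List

from trl.trainer.judges import BasePairwiseJudge  # module path taken from the diff above


class LongerIsBetterJudge(BasePairwiseJudge):
    """Toy judge that prefers the longer completion, standing in for a real preference model."""

    def judge(self, prompts: List[str], completions: List[List[str]], shuffle_order: bool = True) -> List[int]:
        # shuffle_order is accepted for interface compatibility; this deterministic
        # rule has no positional bias for shuffling to cancel out.
        results = []
        for pair in completions:
            if len(pair) != 2:
                results.append(-1)  # documented failure convention: -1 means the judgment failed
            else:
                results.append(0 if len(pair[0]) >= len(pair[1]) else 1)
        return results


judge = LongerIsBetterJudge()
prompts = ["Translate 'hello' to French", "What's the capital of Japan?"]
completions = [["Bonjour", "Salut"], ["Kyoto", "Tokyo"]]
print(judge.judge(prompts, completions))  # [0, 0] under this toy rule
```

`PairRMJudge` layers two options on the same call: `return_scores=True` returns the softmax probability of the first completion instead of a hard rank, and `temperature` rescales the logits before that softmax.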
From 35644e622b900093eb01d4eb61df5e7884bf0c31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 4 Nov 2024 14:04:56 +0000 Subject: [PATCH 08/10] Update image URLs in index.mdx --- docs/source/index.mdx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index fd2508e75f..143d451293 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -48,22 +48,22 @@ Check the appropriate sections of the documentation depending on your needs:

[hunk body lost to HTML stripping; surviving text: the thumbnail line replaced in four cards: "Fine-tuning 20B LLMs with RLHF on a 24GB consumer GPU" (March 9, 2023), "StackLLaMA: A hands-on guide to train LLaMA with RLHF" (April 5, 2023), "Fine-tune Llama 2 with DPO" (August 8, 2023), and "Finetune Stable Diffusion Models with DDPO via TRL" (September 29, 2023), with "Illustrating Reinforcement Learning from Human Feedback" as unchanged leading context]
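The docstrings in PATCH 04 and PATCH 05 also agree on what a custom `system_prompt` for `HfPairwiseJudge` or `OpenAIPairwiseJudge` must provide: the `{prompt}`, `{response0}`, and `{response1}` placeholders, plus an instruction to answer with a single token, because the judge calls inference with `max_tokens=1`. A sketch of such a prompt follows; the wording is invented, and only the placeholder and single-token requirements come from the docstrings:

```python
# Invented example prompt; only the {prompt}/{response0}/{response1} placeholders and the
# single-token answer requirement are taken from the docstrings patched above.
CUSTOM_JUDGE_PROMPT = """\
You are a strict evaluator. Given a user prompt and two candidate responses, decide which
response answers the prompt better.

Prompt: {prompt}

Response 0: {response0}

Response 1: {response1}

The inference call allows only one output token, so reply with exactly "0" or "1"."""

# Hypothetical usage (needs network access and, for HfPairwiseJudge, a Hugging Face token):
# from trl.trainer.judges import HfPairwiseJudge
# judge = HfPairwiseJudge(system_prompt=CUSTOM_JUDGE_PROMPT)
# judge.judge(["What is 2 + 2?"], [["4", "22"]])  # might return [0]
```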
From 460ffa1b3dbff43a4aacc9133f4287b17b87a180 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 4 Nov 2024 14:12:43 +0000 Subject: [PATCH 09/10] Sort and uniform thumbnail --- docs/source/index.mdx | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 143d451293..31a5bd26fa 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -42,30 +42,35 @@ Check the appropriate sections of the documentation depending on your needs:

[hunk body lost to HTML stripping; surviving text: the blog cards re-sorted newest first, with one new card added. Resulting order: "Preference Optimization for Vision Language Models with TRL" (July 10, 2024), "Putting RL back in RLHF" (June 12, 2024, newly added), "Finetune Stable Diffusion Models with DDPO via TRL" (September 29, 2023), "Fine-tune Llama 2 with DPO" (August 8, 2023), "StackLLaMA: A hands-on guide to train LLaMA with RLHF" (April 5, 2023), "Fine-tuning 20B LLMs with RLHF on a 24GB consumer GPU" (March 9, 2023), and "Illustrating Reinforcement Learning from Human Feedback" (December 9, 2022)]
From 63bc7321e4b21bdee6b2334bf7995c4f133e1e11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 4 Nov 2024 14:17:18 +0000 Subject: [PATCH 10/10] Update image alignment in index.mdx --- docs/source/index.mdx | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 31a5bd26fa..bdddc9b6f2 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -38,37 +38,37 @@ Check the appropriate sections of the documentation depending on your needs: