From 0f29027433fd6d334535b002f42744a15f9afb0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Fri, 1 Nov 2024 10:29:15 +0000 Subject: [PATCH 01/10] Bump dev version to `0.13.0.dev0` --- setup.py | 2 +- trl/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 781b036cfe..89a419535d 100644 --- a/setup.py +++ b/setup.py @@ -73,7 +73,7 @@ from setuptools import find_packages, setup -__version__ = "0.12.0.dev0" # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) +__version__ = "0.13.0.dev0" # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) REQUIRED_PKGS = [ "accelerate>=0.34.0", diff --git a/trl/__init__.py b/trl/__init__.py index 28751f56b1..5859a5c91d 100644 --- a/trl/__init__.py +++ b/trl/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.12.0.dev0" +__version__ = "0.13.0.dev0" from typing import TYPE_CHECKING From 54b106dd33c6f9b89a6e17271b0f27ce89c5d7d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Fri, 1 Nov 2024 10:31:55 +0000 Subject: [PATCH 02/10] Update version number to 0.12 in CITATION.cff --- CITATION.cff | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CITATION.cff b/CITATION.cff index 6ea064ec9d..06b7d105d6 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -31,4 +31,4 @@ keywords: - pytorch - transformers license: Apache-2.0 -version: 0.11.1 +version: 0.12 From fc1cac133fb648f3f1161868a642c88dcaa5c1bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 4 Nov 2024 11:41:38 +0000 Subject: [PATCH 03/10] Add publication date to blog post --- docs/source/index.mdx | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index b1de84afb1..81c271cdfd 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -39,6 +39,7 @@ Check the appropriate sections of the documentation depending on your needs:
[hunk body lost to HTML stripping; surviving text: the card for "Preference Optimization for Vision Language Models with TRL" (thumbnail alt text and title), with one added line: "Published on July 10, 2024"]
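The version bump in PATCH 01 is constrained by the setup.py comment quoted above: "expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)". A quick sketch of that convention as an executable check; the regex and the surrounding script are illustrative, not part of TRL's release tooling:

```python
import re

# Accepts x.y.z, x.y.z.dev0 and x.y.z.rcN; rejects dashed forms such as x.y.z-dev0.
VERSION_RE = re.compile(r"^\d+\.\d+\.\d+(\.dev0|\.rc\d+)?$")

for candidate in ["0.13.0.dev0", "0.12.0.rc1", "0.12.0", "0.12.0-dev0"]:
    print(candidate, "ok" if VERSION_RE.match(candidate) else "rejected")
```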
From 337005d95169371935fb87f1c559c7412f8472a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:27:37 +0100 Subject: [PATCH 04/10] =?UTF-8?q?=F0=9F=A7=BD=20Fix=20judge=20documentatio?= =?UTF-8?q?n=20(#2318)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update judge examples and documentation * without ':' * Clean doc * Fix typo in example code * Add space after Attributes * Update attribute name in judges.py * Add installation instructions for llm-blender library * Update PairRMJudge attributes documentation * Fix return type in PairRMJudge --- trl/trainer/judges.py | 132 +++++++++++++++++++++++++----------------- 1 file changed, 79 insertions(+), 53 deletions(-) diff --git a/trl/trainer/judges.py b/trl/trainer/judges.py index e0638565c9..af56ec3d9b 100644 --- a/trl/trainer/judges.py +++ b/trl/trainer/judges.py @@ -76,7 +76,7 @@ class BaseRankJudge(ABC): """ Base class for LLM ranking judges. - Example: + **Example**: ```python class MyRankJudge(BaseRankJudge): def judge(self, prompts, completions, shuffle_order=True): @@ -96,13 +96,18 @@ def judge(self, prompts: List[str], completions: List[List[str]], shuffle_order: Judge the completion for the given prompts and return the ranks of each completion. Args: - prompts (`List[str]`): List of prompts. - completions (`List[List[str]]`): List of completions list, where each element is a list of completions for the corresponding prompt. - shuffle_order (`bool`): Whether to shuffle the order of the completions to avoid positional bias. + prompts (`List[str]`): + List of prompts. + completions (`List[List[str]]`): + List of completions list, where each element is a list of completions for the corresponding prompt. + shuffle_order (`bool`, *optional*, defaults to `True`): + Whether to shuffle the order of the completions to avoid positional bias. Returns: - List of lists of idxs, where each list contains the ranks of the completions for the corresponding prompt. - E.g., [1, 2, 0] means that the second completion (idx=1) is the best, followed by the third, and then the first. + `List[List[int]]`: + List of lists of idxs, where each list contains the ranks of the completions for the corresponding + prompt. E.g., `[1, 2, 0]` means that the second completion (`idx=1`) is the best, followed by the + third, and then the first. """ raise NotImplementedError("Judge subclasses must implement the `judge` method.") @@ -118,18 +123,23 @@ def judge(self, prompts: List[str], completions: List[List[str]], shuffle_order: Judge the completion pairs for the given prompts. Args: - prompts (`List[str]`): List of prompts. - completions (`List[List[str]]`): List of completions pairs, where each element is a pair of completions for the corresponding prompt. - shuffle_order (`bool`): Whether to shuffle the order of the completions to avoid positional bias. + prompts (`List[str]`): + List of prompts. + completions (`List[List[str]]`): + List of completions pairs, where each element is a pair of completions for the corresponding prompt. + shuffle_order (`bool`, *optional*, defaults to `True`): + Whether to shuffle the order of the completions to avoid positional bias. Returns: - List of idxs, where each idx is the rank of the best completion for the corresponding prompt. - E.g., 1 means that the second completion (idx=1) is the best. 
+ `List[int]`: + List of idxs, where each idx is the rank of the best completion for the corresponding prompt. + E.g., `1` means that the second completion (`idx=1`) is the best. Note: - If the judge returns -1 for any prompt, it indicates that the inner process used to compute the preference has failed. - For instance, this could occur if the underlying language model returned an invalid answer. - In such cases, the caller should handle these invalid indices appropriately, possibly by implementing fallback logic or error handling. + If the judge returns `-1` for any prompt, it indicates that the inner process used to compute the + preference has failed. For instance, this could occur if the underlying language model returned an invalid + answer. In such cases, the caller should handle these invalid indices appropriately, possibly by + implementing fallback logic or error handling. """ raise NotImplementedError("Judge subclasses must implement the `judge` method.") @@ -157,30 +167,34 @@ class PairRMJudge(BasePairwiseJudge): """ LLM judge based on the PairRM model from AllenAI. - This judge uses the PairRM model to rank pairs of completions for given prompts. - It's designed for pairwise comparison of language model outputs. - - The PairRM model is loaded using the llm-blender library and runs on the + This judge uses the PairRM model to rank pairs of completions for given prompts. It's designed for pairwise + comparison of language model outputs. The PairRM model is loaded using the llm-blender library and runs on the default Accelerator device. - Attributes: - blender (llm_blender.Blender): An instance of the Blender class from llm-blender. + **Attributes**: + + blender (`llm_blender.Blender`): + An instance of the Blender class from llm-blender. + + **Example**: + ```python + >>> pairrm_judge = PairRMJudge() + >>> prompts = ["Translate 'hello' to French", "What's the capital of Japan?"] + >>> completions = [["Bonjour", "Salut"], ["Kyoto", "Tokyo"]] + >>> results = pairrm_judge.judge(prompts, completions) + >>> print(results) # [0, 1] (indicating the first completion is preferred for the first prompt and the second) + ``` + + - Example: - >>> pairrm_judge = PairRMJudge() - >>> prompts = ["Translate 'hello' to French", "What's the capital of Japan?"] - >>> completions = [["Bonjour", "Salut"], ["Kyoto", "Tokyo"]] - >>> results = pairrm_judge.judge(prompts, completions) - >>> print(results) # [0, 1] (indicating the first completion is preferred for the first prompt and the second) + This class requires the llm-blender library to be installed. Install it with: `pip install llm-blender`. - Note: - This class requires the llm-blender library to be installed. - Install it with: pip install llm-blender + """ def __init__(self): if not is_llm_blender_available(): - raise ValueError("llm-blender is not installed. Please install it with 'pip install llm-blender'.") + raise ValueError("llm-blender is not installed. Please install it with `pip install llm-blender`.") self.blender = llm_blender.Blender() self.blender.loadranker("llm-blender/PairRM", device=Accelerator().device) @@ -196,25 +210,29 @@ def judge( Judge the completion pairs for the given prompts using the PairRM model. Args: - prompts (List[str]): List of prompts to judge. - completions (List[List[str]]): List of completion pairs for each prompt. - shuffle_order (bool, optional): Whether to shuffle the order of completions - to avoid positional bias. Defaults to True. 
- return_scores (bool, optional): If True, return probability scores instead of ranks (i.e. a soft-judge). - Defaults to False. - temperature (float, optional): Temperature for scaling logits if return_scores - is True. Defaults to 1.0. + prompts (`List[str]`): + List of prompts to judge. + completions (`List[List[str]]`): + List of completion pairs for each prompt. + shuffle_order (`bool`, *optional*, defaults to `True`): + Whether to shuffle the order of the completions to avoid positional bias. + return_scores (`bool`, *optional*, defaults to `False`): + If `True`, return probability scores of the first completion instead of ranks (i.e. a *soft-judge*). + temperature (`float`, *optional*, defaults to `1.0`): + Temperature for scaling logits if `return_scores` is True. Returns: - List[Union[int, float]]: List of ranks (0 or 1) or scores for each prompt, - indicating which completion is preferred or its score. + `Union[List[int, float]]`: + If `return_scores` is `False`, returns a list of ranks (`0` or `1`) for each prompt, indicating which + completion is preferred. + If `return_scores` is `True`, returns softmax probabilities for the first completion. Raises: - ValueError: If the number of completions per prompt is not exactly 2. + `ValueError`: + If the number of completions per prompt is not exactly 2. Note: - - Ranks are 0-indexed (0 means the first completion is preferred). - - If return_scores is True, returns softmax probabilities for the first completion. + Unlike llm-blender, ranks are 0-indexed (`0` means the first completion is preferred). """ if len(completions[0]) != 2: @@ -254,11 +272,15 @@ class HfPairwiseJudge(BasePairwiseJudge): This judge is relevant for assessing the quality chat models, where the completion is a response to a given prompt. Args: - model (`str`, *optional*): The model to use for the judge. Defaults to "meta-llama/Meta-Llama-3-70B-Instruct". - token (`str`, *optional*): The Hugging Face API token to use for the InferenceClient. - system_prompt (`str`, *optional*): The system prompt to be used for the judge. If not provided, a default prompt is used. - Note that the system prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. - Also, the inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token response. + model (`str`, *optional*, defaults to `"meta-llama/Meta-Llama-3-70B-Instruct"`): + Model to use for the judge. + token (`str`, *optional*): + Hugging Face API token to use for the [`huggingface_hub.InferenceClient`]. + system_prompt (`str` or `None`, *optional*, defaults to `None`): + The system prompt to be used for the judge. If not provided, a default prompt is used. Note that the system + prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. Also, the + inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token + response. """ def __init__( @@ -306,11 +328,15 @@ class OpenAIPairwiseJudge(BasePairwiseJudge): This judge is relevant for assessing the quality chat models, where the completion is a response to a given prompt. Args: - model (`str`, *optional*): The model to use for the judge. Defaults to `"gpt-4-turbo-preview"`. - system_prompt (`str`, *optional*): The system prompt to be used for the judge. If not provided, a default prompt is used. - Note that the system prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. 
- Also, the inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token response. - max_requests (`int`, *optional*): The maximum number of requests to make to the OpenAI API. Defaults to 1000. If set to `None`, there is no limit. + model (`str`, *optional*, defaults to `"gpt-4-turbo-preview"`): + Model to use for the judge. + system_prompt (`str` or `None`, *optional*, defaults to `None`): + System prompt to be used for the judge. If not provided, a default prompt is used. Note that the system + prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. Also, the + inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token + response. + max_requests (`int` or `None`, *optional*, defaults to `1000`): + Maximum number of requests to make to the OpenAI API. If set to `None`, there is no limit. """ def __init__( From 7c3574c3c08a0171634a0bbd224d65135c2c1fdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 4 Nov 2024 13:29:56 +0000 Subject: [PATCH 05/10] =?UTF-8?q?Revert=20"=F0=9F=A7=BD=20Fix=20judge=20do?= =?UTF-8?q?cumentation=20(#2318)"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 337005d95169371935fb87f1c559c7412f8472a4. --- trl/trainer/judges.py | 132 +++++++++++++++++------------------------- 1 file changed, 53 insertions(+), 79 deletions(-) diff --git a/trl/trainer/judges.py b/trl/trainer/judges.py index af56ec3d9b..e0638565c9 100644 --- a/trl/trainer/judges.py +++ b/trl/trainer/judges.py @@ -76,7 +76,7 @@ class BaseRankJudge(ABC): """ Base class for LLM ranking judges. - **Example**: + Example: ```python class MyRankJudge(BaseRankJudge): def judge(self, prompts, completions, shuffle_order=True): @@ -96,18 +96,13 @@ def judge(self, prompts: List[str], completions: List[List[str]], shuffle_order: Judge the completion for the given prompts and return the ranks of each completion. Args: - prompts (`List[str]`): - List of prompts. - completions (`List[List[str]]`): - List of completions list, where each element is a list of completions for the corresponding prompt. - shuffle_order (`bool`, *optional*, defaults to `True`): - Whether to shuffle the order of the completions to avoid positional bias. + prompts (`List[str]`): List of prompts. + completions (`List[List[str]]`): List of completions list, where each element is a list of completions for the corresponding prompt. + shuffle_order (`bool`): Whether to shuffle the order of the completions to avoid positional bias. Returns: - `List[List[int]]`: - List of lists of idxs, where each list contains the ranks of the completions for the corresponding - prompt. E.g., `[1, 2, 0]` means that the second completion (`idx=1`) is the best, followed by the - third, and then the first. + List of lists of idxs, where each list contains the ranks of the completions for the corresponding prompt. + E.g., [1, 2, 0] means that the second completion (idx=1) is the best, followed by the third, and then the first. """ raise NotImplementedError("Judge subclasses must implement the `judge` method.") @@ -123,23 +118,18 @@ def judge(self, prompts: List[str], completions: List[List[str]], shuffle_order: Judge the completion pairs for the given prompts. Args: - prompts (`List[str]`): - List of prompts. - completions (`List[List[str]]`): - List of completions pairs, where each element is a pair of completions for the corresponding prompt. 
- shuffle_order (`bool`, *optional*, defaults to `True`): - Whether to shuffle the order of the completions to avoid positional bias. + prompts (`List[str]`): List of prompts. + completions (`List[List[str]]`): List of completions pairs, where each element is a pair of completions for the corresponding prompt. + shuffle_order (`bool`): Whether to shuffle the order of the completions to avoid positional bias. Returns: - `List[int]`: - List of idxs, where each idx is the rank of the best completion for the corresponding prompt. - E.g., `1` means that the second completion (`idx=1`) is the best. + List of idxs, where each idx is the rank of the best completion for the corresponding prompt. + E.g., 1 means that the second completion (idx=1) is the best. Note: - If the judge returns `-1` for any prompt, it indicates that the inner process used to compute the - preference has failed. For instance, this could occur if the underlying language model returned an invalid - answer. In such cases, the caller should handle these invalid indices appropriately, possibly by - implementing fallback logic or error handling. + If the judge returns -1 for any prompt, it indicates that the inner process used to compute the preference has failed. + For instance, this could occur if the underlying language model returned an invalid answer. + In such cases, the caller should handle these invalid indices appropriately, possibly by implementing fallback logic or error handling. """ raise NotImplementedError("Judge subclasses must implement the `judge` method.") @@ -167,34 +157,30 @@ class PairRMJudge(BasePairwiseJudge): """ LLM judge based on the PairRM model from AllenAI. - This judge uses the PairRM model to rank pairs of completions for given prompts. It's designed for pairwise - comparison of language model outputs. The PairRM model is loaded using the llm-blender library and runs on the - default Accelerator device. - - **Attributes**: - - blender (`llm_blender.Blender`): - An instance of the Blender class from llm-blender. + This judge uses the PairRM model to rank pairs of completions for given prompts. + It's designed for pairwise comparison of language model outputs. - **Example**: - ```python - >>> pairrm_judge = PairRMJudge() - >>> prompts = ["Translate 'hello' to French", "What's the capital of Japan?"] - >>> completions = [["Bonjour", "Salut"], ["Kyoto", "Tokyo"]] - >>> results = pairrm_judge.judge(prompts, completions) - >>> print(results) # [0, 1] (indicating the first completion is preferred for the first prompt and the second) - ``` + The PairRM model is loaded using the llm-blender library and runs on the + default Accelerator device. - + Attributes: + blender (llm_blender.Blender): An instance of the Blender class from llm-blender. - This class requires the llm-blender library to be installed. Install it with: `pip install llm-blender`. + Example: + >>> pairrm_judge = PairRMJudge() + >>> prompts = ["Translate 'hello' to French", "What's the capital of Japan?"] + >>> completions = [["Bonjour", "Salut"], ["Kyoto", "Tokyo"]] + >>> results = pairrm_judge.judge(prompts, completions) + >>> print(results) # [0, 1] (indicating the first completion is preferred for the first prompt and the second) - + Note: + This class requires the llm-blender library to be installed. + Install it with: pip install llm-blender """ def __init__(self): if not is_llm_blender_available(): - raise ValueError("llm-blender is not installed. 
Please install it with `pip install llm-blender`.") + raise ValueError("llm-blender is not installed. Please install it with 'pip install llm-blender'.") self.blender = llm_blender.Blender() self.blender.loadranker("llm-blender/PairRM", device=Accelerator().device) @@ -210,29 +196,25 @@ def judge( Judge the completion pairs for the given prompts using the PairRM model. Args: - prompts (`List[str]`): - List of prompts to judge. - completions (`List[List[str]]`): - List of completion pairs for each prompt. - shuffle_order (`bool`, *optional*, defaults to `True`): - Whether to shuffle the order of the completions to avoid positional bias. - return_scores (`bool`, *optional*, defaults to `False`): - If `True`, return probability scores of the first completion instead of ranks (i.e. a *soft-judge*). - temperature (`float`, *optional*, defaults to `1.0`): - Temperature for scaling logits if `return_scores` is True. + prompts (List[str]): List of prompts to judge. + completions (List[List[str]]): List of completion pairs for each prompt. + shuffle_order (bool, optional): Whether to shuffle the order of completions + to avoid positional bias. Defaults to True. + return_scores (bool, optional): If True, return probability scores instead of ranks (i.e. a soft-judge). + Defaults to False. + temperature (float, optional): Temperature for scaling logits if return_scores + is True. Defaults to 1.0. Returns: - `Union[List[int, float]]`: - If `return_scores` is `False`, returns a list of ranks (`0` or `1`) for each prompt, indicating which - completion is preferred. - If `return_scores` is `True`, returns softmax probabilities for the first completion. + List[Union[int, float]]: List of ranks (0 or 1) or scores for each prompt, + indicating which completion is preferred or its score. Raises: - `ValueError`: - If the number of completions per prompt is not exactly 2. + ValueError: If the number of completions per prompt is not exactly 2. Note: - Unlike llm-blender, ranks are 0-indexed (`0` means the first completion is preferred). + - Ranks are 0-indexed (0 means the first completion is preferred). + - If return_scores is True, returns softmax probabilities for the first completion. """ if len(completions[0]) != 2: @@ -272,15 +254,11 @@ class HfPairwiseJudge(BasePairwiseJudge): This judge is relevant for assessing the quality chat models, where the completion is a response to a given prompt. Args: - model (`str`, *optional*, defaults to `"meta-llama/Meta-Llama-3-70B-Instruct"`): - Model to use for the judge. - token (`str`, *optional*): - Hugging Face API token to use for the [`huggingface_hub.InferenceClient`]. - system_prompt (`str` or `None`, *optional*, defaults to `None`): - The system prompt to be used for the judge. If not provided, a default prompt is used. Note that the system - prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. Also, the - inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token - response. + model (`str`, *optional*): The model to use for the judge. Defaults to "meta-llama/Meta-Llama-3-70B-Instruct". + token (`str`, *optional*): The Hugging Face API token to use for the InferenceClient. + system_prompt (`str`, *optional*): The system prompt to be used for the judge. If not provided, a default prompt is used. + Note that the system prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. 
+ Also, the inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token response. """ def __init__( @@ -328,15 +306,11 @@ class OpenAIPairwiseJudge(BasePairwiseJudge): This judge is relevant for assessing the quality chat models, where the completion is a response to a given prompt. Args: - model (`str`, *optional*, defaults to `"gpt-4-turbo-preview"`): - Model to use for the judge. - system_prompt (`str` or `None`, *optional*, defaults to `None`): - System prompt to be used for the judge. If not provided, a default prompt is used. Note that the system - prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. Also, the - inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token - response. - max_requests (`int` or `None`, *optional*, defaults to `1000`): - Maximum number of requests to make to the OpenAI API. If set to `None`, there is no limit. + model (`str`, *optional*): The model to use for the judge. Defaults to `"gpt-4-turbo-preview"`. + system_prompt (`str`, *optional*): The system prompt to be used for the judge. If not provided, a default prompt is used. + Note that the system prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. + Also, the inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token response. + max_requests (`int`, *optional*): The maximum number of requests to make to the OpenAI API. Defaults to 1000. If set to `None`, there is no limit. """ def __init__( From 919c9801d2c4133bf84573a19d60a3bdc992fdde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 4 Nov 2024 13:48:34 +0000 Subject: [PATCH 06/10] Update blog post publication dates --- docs/source/index.mdx | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 81c271cdfd..cc9820f4d9 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -37,29 +37,34 @@ Check the appropriate sections of the documentation depending on your needs:
From 00945775bfc4bb8c327e07ace77a46c04c20ae5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 4 Nov 2024 14:04:20 +0000 Subject: [PATCH 07/10] revert to p5 --- docs/source/index.mdx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index cc9820f4d9..fd2508e75f 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -37,27 +37,27 @@ Check the appropriate sections of the documentation depending on your needs:
[hunk body lost to HTML stripping; surviving text: one removed/re-added line pair in each of five blog cards: "Preference Optimization for Vision Language Models with TRL" (July 10, 2024), "Illustrating Reinforcement Learning from Human Feedback" (December 9, 2022), "Fine-tuning 20B LLMs with RLHF on a 24GB consumer GPU" (March 9, 2023), "StackLLaMA: A hands-on guide to train LLaMA with RLHF" (April 5, 2023), and "Fine-tune Llama 2 with DPO" (August 8, 2023)]
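Both the rewritten docstrings in PATCH 04 and the restored originals in PATCH 05 describe the same pairwise-judge contract: `judge(prompts, completions, shuffle_order=True)` returns one index per prompt, `0` or `1` for the preferred completion, with `-1` reserved for a failed judgment. A minimal sketch of a custom judge written against that contract; the `LongerIsBetterJudge` class and its length heuristic are invented for illustration and are not part of TRL:

```python
from typing import List

from trl.trainer.judges import BasePairwiseJudge  # module path taken from the diff above


class LongerIsBetterJudge(BasePairwiseJudge):
    """Toy judge that prefers the longer completion, standing in for a real preference model."""

    def judge(self, prompts: List[str], completions: List[List[str]], shuffle_order: bool = True) -> List[int]:
        # shuffle_order is accepted for interface compatibility; this deterministic
        # rule has no positional bias for shuffling to cancel out.
        results = []
        for pair in completions:
            if len(pair) != 2:
                results.append(-1)  # documented failure convention: -1 means the judgment failed
            else:
                results.append(0 if len(pair[0]) >= len(pair[1]) else 1)
        return results


judge = LongerIsBetterJudge()
prompts = ["Translate 'hello' to French", "What's the capital of Japan?"]
completions = [["Bonjour", "Salut"], ["Kyoto", "Tokyo"]]
print(judge.judge(prompts, completions))  # [0, 0] under this toy rule
```

`PairRMJudge` layers two options on the same call: `return_scores=True` returns the softmax probability of the first completion instead of a hard rank, and `temperature` rescales the logits before that softmax.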
From 35644e622b900093eb01d4eb61df5e7884bf0c31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 4 Nov 2024 14:04:56 +0000 Subject: [PATCH 08/10] Update image URLs in index.mdx --- docs/source/index.mdx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index fd2508e75f..143d451293 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -48,22 +48,22 @@ Check the appropriate sections of the documentation depending on your needs:

[hunk body lost to HTML stripping; surviving text: the thumbnail line replaced in four cards: "Fine-tuning 20B LLMs with RLHF on a 24GB consumer GPU" (March 9, 2023), "StackLLaMA: A hands-on guide to train LLaMA with RLHF" (April 5, 2023), "Fine-tune Llama 2 with DPO" (August 8, 2023), and "Finetune Stable Diffusion Models with DDPO via TRL" (September 29, 2023), with "Illustrating Reinforcement Learning from Human Feedback" as unchanged leading context]
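The docstrings in PATCH 04 and PATCH 05 also agree on what a custom `system_prompt` for `HfPairwiseJudge` or `OpenAIPairwiseJudge` must provide: the `{prompt}`, `{response0}`, and `{response1}` placeholders, plus an instruction to answer with a single token, because the judge calls inference with `max_tokens=1`. A sketch of such a prompt follows; the wording is invented, and only the placeholder and single-token requirements come from the docstrings:

```python
# Invented example prompt; only the {prompt}/{response0}/{response1} placeholders and the
# single-token answer requirement are taken from the docstrings patched above.
CUSTOM_JUDGE_PROMPT = """\
You are a strict evaluator. Given a user prompt and two candidate responses, decide which
response answers the prompt better.

Prompt: {prompt}

Response 0: {response0}

Response 1: {response1}

The inference call allows only one output token, so reply with exactly "0" or "1"."""

# Hypothetical usage (needs network access and, for HfPairwiseJudge, a Hugging Face token):
# from trl.trainer.judges import HfPairwiseJudge
# judge = HfPairwiseJudge(system_prompt=CUSTOM_JUDGE_PROMPT)
# judge.judge(["What is 2 + 2?"], [["4", "22"]])  # might return [0]
```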
From 460ffa1b3dbff43a4aacc9133f4287b17b87a180 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 4 Nov 2024 14:12:43 +0000 Subject: [PATCH 09/10] Sort and uniform thumbnail --- docs/source/index.mdx | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 143d451293..31a5bd26fa 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -42,30 +42,35 @@ Check the appropriate sections of the documentation depending on your needs:

[hunk body lost to HTML stripping; surviving text: the blog cards re-sorted newest first, with one new card added. Resulting order: "Preference Optimization for Vision Language Models with TRL" (July 10, 2024), "Putting RL back in RLHF" (June 12, 2024, newly added), "Finetune Stable Diffusion Models with DDPO via TRL" (September 29, 2023), "Fine-tune Llama 2 with DPO" (August 8, 2023), "StackLLaMA: A hands-on guide to train LLaMA with RLHF" (April 5, 2023), "Fine-tuning 20B LLMs with RLHF on a 24GB consumer GPU" (March 9, 2023), and "Illustrating Reinforcement Learning from Human Feedback" (December 9, 2022)]
From 63bc7321e4b21bdee6b2334bf7995c4f133e1e11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 4 Nov 2024 14:17:18 +0000 Subject: [PATCH 10/10] Update image alignment in index.mdx --- docs/source/index.mdx | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 31a5bd26fa..bdddc9b6f2 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -38,37 +38,37 @@ Check the appropriate sections of the documentation depending on your needs: