From 65c8e28a2571da33cc5f803f3f812515c5d2347b Mon Sep 17 00:00:00 2001
From: leandro
Date: Wed, 25 Jan 2023 11:41:13 +0100
Subject: [PATCH] restructure examples

---
 docs/source/sentiment_tuning.mdx              | 378 +-----------
 examples/README.md                            |  33 ++
 .../notebooks/distilbert-imdb-training.ipynb  | 292 ---------
 examples/scripts/ppo-sentiment-adam-8bit.py   | 153 -----
 .../notebooks/gpt2-sentiment-control.ipynb    |   0
 .../notebooks/gpt2-sentiment.ipynb}           | 561 +++++-------------
 .../scripts/gpt2-sentiment.py}                |  31 +-
 .../scripts/t5-sentiment.py}                  |   0
 8 files changed, 198 insertions(+), 1250 deletions(-)
 create mode 100644 examples/README.md
 delete mode 100644 examples/notebooks/distilbert-imdb-training.ipynb
 delete mode 100644 examples/scripts/ppo-sentiment-adam-8bit.py
 rename examples/{ => sentiment}/notebooks/gpt2-sentiment-control.ipynb (100%)
 rename examples/{notebooks/gpt2-sentiment-ppo-training.ipynb => sentiment/notebooks/gpt2-sentiment.ipynb} (69%)
 rename examples/{scripts/ppo-sentiment.py => sentiment/scripts/gpt2-sentiment.py} (81%)
 rename examples/{scripts/ppo-sentiment-t5-small.py => sentiment/scripts/t5-sentiment.py} (100%)

diff --git a/docs/source/sentiment_tuning.mdx b/docs/source/sentiment_tuning.mdx
index 241d667365..699fa037b3 100644
--- a/docs/source/sentiment_tuning.mdx
+++ b/docs/source/sentiment_tuning.mdx
@@ -1,373 +1,33 @@
-# Tune GPT2 to generate positive reviews
-Optimise GPT2 to produce positive IMDB movie reviews using a BERT sentiment classifier as a reward function.
+# Sentiment Examples

-Experiment setup to tune GPT2. The yellow arrows are outside the scope of this notebook, but the trained models are available through Hugging Face:
+The notebooks and scripts in these examples show how to fine-tune a model with a sentiment classifier (such as `lvwerra/distilbert-imdb`).

-
+Here's an overview of the notebooks and scripts in the [trl repository](https://github.com/lvwerra/trl/tree/main/examples):

-In this notebook we fine-tune GPT2 (small) to generate positive movie reviews based on the IMDB dataset. The model gets the start of a real review and is tasked to produce positive continuations. To reward positive continuations we use a BERT classifier to analyse the sentiment of the produced sentences and use the classifier's outputs as rewards signals for PPO training.
+| File | Description |
+|---|---|
+| `examples/notebooks/gpt2-sentiment.ipynb` | Fine-tune GPT2 to generate positive movie reviews. |
+| `examples/notebooks/gpt2-sentiment-control.ipynb` | Fine-tune GPT2 to generate movie reviews with controlled sentiment. |
+| `examples/scripts/gpt2-sentiment.py` | Same as the notebook, but easier to use in a multi-GPU setup. |
+| `examples/scripts/t5-sentiment.py` | Same as the GPT2 script, but for a Seq2Seq model (T5). |

-## Setup experiment
-First we need to setup a few things: imports, configuration, and logger.
- -## Install dependencies +## Installation ```bash -pip install datasets -``` - -### Import dependencies - -```python -import torch -import wandb -import time -import os -from tqdm import tqdm -import numpy as np -import pandas as pd -tqdm.pandas() - -from datasets import load_dataset - -from transformers import AutoTokenizer, pipeline - -from trl.gpt2 import AutoModelForCausalLMWithValueHead -from trl.ppo import PPOTrainer -from trl.core import build_bert_batch_from_txt, listify_batch -``` - -### Configuration - -Next we setup a few configs for the training: - -```python -config = { - "model_name": "lvwerra/gpt2-imdb", - "cls_model_name": "lvwerra/distilbert-imdb", - "steps": 20000, - "batch_size": 256, - "forward_batch_size": 16, - "ppo_epochs": 4, - "txt_in_min_len": 2, - "txt_in_max_len": 8, - "txt_out_min_len": 4, - "txt_out_max_len": 16, - "lr": 1.41e-5, - "init_kl_coef":0.2, - "target": 6, - "horizon":10000, - "gamma":1, - "lam":0.95, - "cliprange": .2, - "cliprange_value":.2, - "vf_coef":.1, -} -``` - -**Forward batching**: Since the models can be fairly big and we want to rollout large PPO batches this can lead to out-of-memory errors when doing the forward passes for text generation and sentiment analysis. We introduce the parameter `forward_batch_size` to split the forward passes into smaller batches. Although this hurts performance a little this is neglectible compared to the computations of the backward passes when optimizing the model. The same parameter is used in the `PPOTrainer` when doing forward passes. The `batch_size` should multiple of `forward_batch_size`. - - -```python -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -pipe_device = 0 if torch.cuda.is_available() else -1 -``` - -You can see that we load a GPT2 model called `gpt2_imdb`. This model was additionally fine-tuned on the IMDB dataset for 1 epoch with the Hugging Face [script](https://github.com/huggingface/transformers/blob/master/examples/run_language_modeling.py) (no special settings). The other parameters are mostly taken from the original paper ["Fine-Tuning Language Models from Human Preferences"]( -https://arxiv.org/pdf/1909.08593.pdf). This model as well as the BERT model is available in the Hugging Face model zoo [here](https://huggingface.co/models). The following code will automatically download the models. - -### Initialize W&B logger -We use `wandb`to log all the metrics during training. - -```python -wandb.init(name='run-42', project='gpt2-test', config=config, ) -``` - -## Load data and models - -### Load IMDB dataset -The IMDB dataset contains 50k movie review annotated with "positive"/"negative" feedback indicating the sentiment. We load the IMDB dataset into a DataFrame and filter for comments that are at least 500 characters long and take the first 1000 characters of each comment. The first filter we apply to avoid comments that are less than `txt_in_len` token long and the second to avoid tokenizing way more text than we actually need. - - -```python -ds = load_dataset('imdb', split='train') -ds = ds.rename_columns({'text': 'review', 'label': 'sentiment'}) -ds = ds.filter(lambda x: len(x["review"])>200, batched=False) -ds -``` - -```python - Dataset({ - features: ['review', 'sentiment'], - num_rows: 24895 - }) -``` - - -### Load BERT classifier -We load a BERT classifier fine-tuned on the IMDB dataset. 
- - -```python -sent_kwargs = { - "return_all_scores": True, - "function_to_apply": "none", - "batch_size": config["forward_batch_size"] -} - -sentiment_pipe = pipeline("sentiment-analysis","lvwerra/distilbert-imdb", device=pipe_device) -``` - -The model outputs are the logits for the negative and positive class. We will use the logits for positive class as a reward signal for the language model. - - -```python -text = 'this movie was really bad!!' -sentiment_pipe(text, **sent_kwargs) -``` - -```python - [[{'label': 'NEGATIVE', 'score': 2.335048198699951}, - {'label': 'POSITIVE', 'score': -2.726576566696167}]] -``` - - - -```python -text = 'this movie was really good!!' -sentiment_pipe(text, **sent_kwargs) +pip install trl +#optional: wandb +pip install wandb ``` -```python - [[{'label': 'NEGATIVE', 'score': -2.2947897911071777}, - {'label': 'POSITIVE', 'score': 2.557039737701416}]] -``` - -### Load pre-trained GPT2 language models - -We load the GPT2 model with a value head and the tokenizer. We load the model twice; the first model is optimized while the second model serves as a reference to calculate the KL-divergence from the starting point. This serves as an additional reward signal in the PPO training to make sure the optimized model does not deviate too much from the original language model. - - -```python -gpt2_model = AutoModelForCausalLMWithValueHead.from_pretrained(config['model_name']) -gpt2_model_ref = AutoModelForCausalLMWithValueHead.from_pretrained(config['model_name']) - -gpt2_tokenizer = AutoTokenizer.from_pretrained(config['model_name']) -gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token -``` +Note: if you don't want to log with `wandb` remove `log_with="wandb"` in the scripts/notebooks. You can also replace it with your favourite experiment tracker that's [supported by `accelerate`](https://huggingface.co/docs/accelerate/usage_guides/tracking). -### Watch model with wandb -This wandb magic logs the gradients and weights of the model during training. +## Launch scripts -```python -wandb.watch(gpt2_model, log='all') -``` - -### Move models to GPU - -If `cuda` is available move the computations to the GPU. - - -```python -gpt2_model.to(device); -gpt2_model_ref.to(device); -``` - -### Tokenize IMDB reviews - -We want to randomize the query and response length so we introduce a `LengthSampler` that uniformly samples values from an interval. - - -```python -class LengthSampler: - def __init__(self, min_value, max_value): - self.values = list(range(min_value, max_value)) - def __call__(self): - return np.random.choice(self.values) - -input_size = LengthSampler(config["txt_in_min_len"], config["txt_in_max_len"]) -output_size = LengthSampler(config["txt_out_min_len"], config["txt_out_max_len"]) -``` - -We pre-tokenize all IMDB in advance to avoid tokenizing twice. In the first step we encode the queries and slice the first `input_size()` tokens. In a second step we decode these tokens back to text for later display. - - -```python -def tokenize(sample): - sample["tokens"] = gpt2_tokenizer.encode(sample["review"])[:input_size()] - sample["query"] = gpt2_tokenizer.decode(sample["tokens"]) - return sample - -ds = ds.map(tokenize, batched=False) -``` - -### Generation settings -For the response generation we just use sampling and make sure top-k and nucleus sampling are turned off as well as a minimal length. 
- - -```python -gen_kwargs = { - "min_length":-1, - "top_k": 0.0, - "top_p": 1.0, - "do_sample": True, - "pad_token_id": gpt2_tokenizer.eos_token_id -} -``` - -## Optimize model - -### Dataloader -We use a dataloader to return the batches of queries used for each PPO epoch: - - -```python -def collator(data): - return dict((key, [d[key] for d in data]) for key in data[0]) - -dataloader = torch.utils.data.DataLoader(ds, batch_size=config['batch_size'], collate_fn=collator) -``` - -### Training loop - -The training loop consists of the following main steps: -1. Get the query responses from the policy network (GPT-2) -2. Get sentiments for query/responses from BERT -3. Optimize policy with PPO using the (query, response, reward) triplet - -**Training time** - -This step takes **~2h** on a V100 GPU with the above specified settings. - - -```python -ppo_trainer = PPOTrainer(gpt2_model, gpt2_model_ref, gpt2_tokenizer, **config) - -total_ppo_epochs = int(np.ceil(config["steps"]/config['batch_size'])) - -for epoch, batch in tqdm(zip(range(total_ppo_epochs), iter(dataloader))): - logs, timing = dict(), dict() - t0 = time.time() - query_tensors = [torch.tensor(t).long().to(device) for t in batch["tokens"]] - - #### Get response from gpt2 - t = time.time() - response_tensors = [] - for i in range(config['batch_size']): - gen_len = output_size() - response = gpt2_model.generate(query_tensors[i].unsqueeze(dim=0), - max_new_tokens=gen_len, **gen_kwargs) - response_tensors.append(response.squeeze()[-gen_len:]) - batch['response'] = [gpt2_tokenizer.decode(r.squeeze()) for r in response_tensors] - timing['time/get_response'] = time.time()-t - - #### Compute sentiment score - t = time.time() - texts = [q + r for q,r in zip(batch['query'], batch['response'])] - pipe_outputs = sentiment_pipe(texts, **sent_kwargs) - rewards = torch.tensor([output[1]["score"] for output in pipe_outputs]).to(device) - timing['time/get_sentiment_preds'] = time.time()-t - - #### Run PPO step - t = time.time() - stats = ppo_trainer.step(query_tensors, response_tensors, rewards) - timing['time/optimization'] = time.time()-t - - #### Log everything - timing['time/epoch'] = time.time()-t0 - table_rows = [list(r) for r in zip(batch['query'], batch['response'], rewards.cpu().tolist())] - logs.update({'game_log': wandb.Table(columns=['query', 'response', 'reward'], rows=table_rows)}) - logs.update(timing) - logs.update(stats) - logs['env/reward_mean'] = torch.mean(rewards).cpu().numpy() - logs['env/reward_std'] = torch.std(rewards).cpu().numpy() - logs['env/reward_dist'] = rewards.cpu().numpy() - wandb.log(logs) -``` - -### Training progress -If you are tracking the training progress with Weights&Biases you should see a plot similar to the one below. Check out the interactive sample report on wandb.ai: [link](https://app.wandb.ai/lvwerra/trl-showcase/runs/1jtvxb1m/). - -Reward mean and distribution evolution during training: - - -One can observe how the model starts to generate more positive outputs after a few optimisation steps. - -> Note: Investigating the KL-divergence will probably show that at this point the model has not converged to the target KL-divergence, yet. To get there would require longer training or starting with a higher inital coefficient. - -## Model inspection -Let's inspect some examples from the IMDB dataset. We can use `gpt2_model_ref` to compare the tuned model `gpt2_model` against the model before optimisation. 
- - -```python -#### get a batch from the dataset -bs = 16 -game_data = dict() -ds.set_format("pandas") -df_batch = ds[:].sample(bs) -game_data['query'] = df_batch['query'].tolist() -query_tensors = df_batch['tokens'].tolist() - -response_tensors_ref, response_tensors = [], [] - -#### get response from gpt2 and gpt2_ref -for i in range(bs): - gen_len = output_size() - output = gpt2_model_ref.generate(torch.tensor(query_tensors[i]).unsqueeze(dim=0).to(device), - max_new_tokens=gen_len, **gen_kwargs).squeeze()[-gen_len:] - response_tensors_ref.append(output) - output = gpt2_model.generate(torch.tensor(query_tensors[i]).unsqueeze(dim=0).to(device), - max_new_tokens=gen_len, **gen_kwargs).squeeze()[-gen_len:] - response_tensors.append(output) - -#### decode responses -game_data['response (before)'] = [gpt2_tokenizer.decode(response_tensors_ref[i]) for i in range(bs)] -game_data['response (after)'] = [gpt2_tokenizer.decode(response_tensors[i]) for i in range(bs)] - -#### sentiment analysis of query/response pairs before/after -texts = [q + r for q,r in zip(game_data['query'], game_data['response (before)'])] -game_data['rewards (before)'] = [output[1]["score"] for output in sentiment_pipe(texts, **sent_kwargs)] - -texts = [q + r for q,r in zip(game_data['query'], game_data['response (after)'])] -game_data['rewards (after)'] = [output[1]["score"] for output in sentiment_pipe(texts, **sent_kwargs)] - -# store results in a dataframe -df_results = pd.DataFrame(game_data) -df_results -``` - - -Looking at the reward mean/median of the generated sequences we observe a significant difference. - - -```python -print('mean:') -display(df_results[["rewards (before)", "rewards (after)"]].mean()) -print() -print('median:') -display(df_results[["rewards (before)", "rewards (after)"]].median()) -``` - - mean: - rewards (before) 0.156629 - rewards (after) 1.686487 - - median: - rewards (before) -0.547091 - rewards (after) 2.479868 - - -## Save model -Finally, we save the model and push it to the Hugging Face for later usage. Before we can push the model to the hub we need to make sure we logged in: +The `trl` library is powered by `accelerate`. As such it is best to configure and launch trainings with the following commands: ```bash -huggingface-cli login -``` - -```python -gpt2_model.save_pretrained('gpt2-imdb-pos-v2', push_to_hub=True) -gpt2_tokenizer.save_pretrained('gpt2-imdb-pos-v2', push_to_hub=True) -``` - +accelerate config # will prompt you to define the training configuration +accelerate launch scripts/gpt2-sentiment.py # launches training +``` \ No newline at end of file diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000000..ec2eac541f --- /dev/null +++ b/examples/README.md @@ -0,0 +1,33 @@ +# Sentiment Examples + +The notebooks and scripts in this examples show how to fine-tune a model with a sentiment classifier (such as `lvwerra/distilbert-imdb`). + +Here's an overview of the notebooks and scripts: + +| File | Description | +|---|---| +| `notebooks/gpt2-sentiment.ipynb` | Fine-tune GPT2 to generate positive movie reviews. | +| `notebooks/gpt2-sentiment-control.ipynb` | Fine-tune GPT2 to generate movie reviews with controlled sentiment. | +| `scripts/gpt2-sentiment.py` | Same as the notebook, but easier to use to use in mutli-GPU setup. | +| `scripts/t5-sentiment.py` | Same as GPT2 script, but for a Seq2Seq model (T5). 
| + + +## Installation + +```bash +pip install trl +#optional: wandb +pip install wandb +``` + +Note: if you don't want to log with `wandb` remove `log_with="wandb"` in the scripts/notebooks. You can also replace it with your favourite experiment tracker that's [supported by `accelerate`](https://huggingface.co/docs/accelerate/usage_guides/tracking). + + +## Launch scripts + +The `trl` library is powered by `accelerate`. As such it is best to configure and launch trainings with the following commands: + +```bash +accelerate config # will prompt you to define the training configuration +accelerate launch scripts/gpt2-sentiment.py # launches training +``` \ No newline at end of file diff --git a/examples/notebooks/distilbert-imdb-training.ipynb b/examples/notebooks/distilbert-imdb-training.ipynb deleted file mode 100644 index 302f538d82..0000000000 --- a/examples/notebooks/distilbert-imdb-training.ipynb +++ /dev/null @@ -1,292 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Train IMDb Classifier\n", - "> Train a IMDb classifier with DistilBERT." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!huggingface-cli login" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load IMDb dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from datasets import load_dataset, load_metric" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ds = load_dataset(\"imdb\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "DatasetDict({\n", - " train: Dataset({\n", - " features: ['text', 'label'],\n", - " num_rows: 25000\n", - " })\n", - " test: Dataset({\n", - " features: ['text', 'label'],\n", - " num_rows: 25000\n", - " })\n", - " unsupervised: Dataset({\n", - " features: ['text', 'label'],\n", - " num_rows: 50000\n", - " })\n", - "})" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ds" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None),\n", - " 'text': Value(dtype='string', id=None)}" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ds['train'].features" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load Pretrained DistilBERT" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from transformers import AutoModelForSequenceClassification, AutoTokenizer\n", - "\n", - "model_name = \"distilbert-base-uncased\"\n", - "model = AutoModelForSequenceClassification.from_pretrained(model_name)\n", - "tokenizer = AutoTokenizer.from_pretrained(model_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prepocess Data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6ddef2e0d4a04e12ad7513950158236c", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/25 [00:00200, batched=False) - - input_size = 
LengthSampler(input_min_text_length, input_max_text_length) - - def tokenize(sample): - sample["input_ids"] = tokenizer.encode(sample["review"])[:input_size()] - sample["query"] = tokenizer.decode(sample["input_ids"]) - return sample - - ds = ds.map(tokenize, batched=False) - ds.set_format(type='torch') - return ds - -# We retrieve the dataloader by calling the `build_dataset` function. -dataset = build_dataset(config) - -def collator(data): - return dict((key, [d[key] for d in data]) for key in data[0]) - -# Now let's build the model, the reference model, and the tokenizer. -model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name) -ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name) -tokenizer = AutoTokenizer.from_pretrained(config.model_name) -optimizer = bnb.optim.Adam8bit(model.parameters(), lr=config.learning_rate) - -# GPT-2 tokenizer has a pad token, but it is not eos_token by default. We need to set it to eos_token. -# only for this model. -tokenizer.pad_token = tokenizer.eos_token - -# We then build the PPOTrainer, passing the model, the reference model, the tokenizer -ppo_trainer = PPOTrainer(config, model, ref_model, tokenizer, dataset=dataset, data_collator=collator, optimizer=optimizer) - -# We then build the sentiment analysis pipeline, passing the model name and the -# sentiment analysis pipeline arguments. Let's also make sure to set the device -# to the same device as the PPOTrainer. -device = ppo_trainer.accelerator.device -if ppo_trainer.accelerator.num_processes == 1: - device = 0 if torch.cuda.is_available() else "cpu" # to avoid a `pipeline` bug -sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb", device=device) - -# We then define the arguments to pass to the `generate` function. These arguments -# are passed to the `generate` function of the PPOTrainer, which is a wrapper around -# the `generate` function of the trained model. 
-generation_kwargs = { - "min_length":-1, - "top_k": 0.0, - "top_p": 1.0, - "do_sample": True, - "pad_token_id": tokenizer.eos_token_id -} -output_min_length = 4 -output_max_length = 16 -output_length_sampler = LengthSampler(output_min_length, output_max_length) - -for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)): - query_tensors = batch['input_ids'] - - #### Get response from gpt2 - response_tensors = [] - for query in query_tensors: - gen_len = output_length_sampler() - generation_kwargs["max_new_tokens"] = gen_len - response = ppo_trainer.generate(query, **generation_kwargs) - response_tensors.append(response.squeeze()[-gen_len:]) - batch['response'] = [tokenizer.decode(r.squeeze()) for r in response_tensors] - - #### Compute sentiment score - texts = [q + r for q,r in zip(batch['query'], batch['response'])] - pipe_outputs = sentiment_pipe(texts, **sent_kwargs) - rewards = [torch.tensor(output[1]["score"]).to(device) for output in pipe_outputs] - - #### Run PPO step - stats = ppo_trainer.step(query_tensors, response_tensors, rewards) - ppo_trainer.log_stats(stats, batch, rewards) \ No newline at end of file diff --git a/examples/notebooks/gpt2-sentiment-control.ipynb b/examples/sentiment/notebooks/gpt2-sentiment-control.ipynb similarity index 100% rename from examples/notebooks/gpt2-sentiment-control.ipynb rename to examples/sentiment/notebooks/gpt2-sentiment-control.ipynb diff --git a/examples/notebooks/gpt2-sentiment-ppo-training.ipynb b/examples/sentiment/notebooks/gpt2-sentiment.ipynb similarity index 69% rename from examples/notebooks/gpt2-sentiment-ppo-training.ipynb rename to examples/sentiment/notebooks/gpt2-sentiment.ipynb index ad5bc85381..d999961f40 100644 --- a/examples/notebooks/gpt2-sentiment-ppo-training.ipynb +++ b/examples/sentiment/notebooks/gpt2-sentiment.ipynb @@ -38,11 +38,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", @@ -52,29 +48,18 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "import torch\n", - "import wandb\n", - "import time\n", - "import os\n", "from tqdm import tqdm\n", - "import numpy as np\n", - "import pandas as pd\n", "tqdm.pandas()\n", "\n", + "from transformers import pipeline, AutoTokenizer\n", "from datasets import load_dataset\n", "\n", - "from transformers import AutoTokenizer, pipeline\n", - "\n", - "from trl.gpt2 import GPT2HeadWithValueModel, respond_to_batch\n", - "from trl.ppo import PPOTrainer\n", - "from trl.core import build_bert_batch_from_txt" + "from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead\n", + "from trl.core import LengthSampler" ] }, { @@ -87,33 +72,19 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ - "config = {\n", - " \"model_name\": \"lvwerra/gpt2-imdb\",\n", - " \"cls_model_name\": \"lvwerra/distilbert-imdb\",\n", - " \"steps\": 20000,\n", - " \"batch_size\": 256,\n", - " \"forward_batch_size\": 16,\n", - " \"ppo_epochs\": 4, \n", - " \"txt_in_min_len\": 2,\n", - " \"txt_in_max_len\": 8,\n", - " \"txt_out_min_len\": 4,\n", - " \"txt_out_max_len\": 16,\n", - " \"lr\": 1.41e-5,\n", - " \"init_kl_coef\":0.2,\n", - " \"target\": 6,\n", - " \"horizon\":10000,\n", - " \"gamma\":1,\n", - " \"lam\":0.95,\n", - " \"cliprange\": .2,\n", - 
" \"cliprange_value\":.2,\n", - " \"vf_coef\":.1, \n", + "config = PPOConfig(\n", + " model_name=\"lvwerra/gpt2-imdb\",\n", + " learning_rate=1.41e-5,\n", + " log_with=\"wandb\",\n", + ")\n", + "\n", + "sent_kwargs = {\n", + " \"return_all_scores\": True,\n", + " \"function_to_apply\": \"none\",\n", + " \"batch_size\": config.forward_batch_size\n", "}" ] }, @@ -124,20 +95,6 @@ "**Forward batching**: Since the models can be fairly big and we want to rollout large PPO batches this can lead to out-of-memory errors when doing the forward passes for text generation and sentiment analysis. We introduce the parameter `forward_batch_size` to split the forward passes into smaller batches. Although this hurts performance a little this is neglectible compared to the computations of the backward passes when optimizing the model. The same parameter is used in the `PPOTrainer` when doing forward passes. The `batch_size` should multiple of `forward_batch_size`." ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, - "outputs": [], - "source": [ - "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - "pipe_device = 0 if torch.cuda.is_available() else -1" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -146,65 +103,6 @@ "https://arxiv.org/pdf/1909.08593.pdf). This model as well as the BERT model is available in the Huggingface model zoo [here](https://huggingface.co/models). The following code should automatically download the models." ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Initialize W&B logger\n", - "We use `wandb`to log all the metrics during training." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mlvwerra\u001b[0m (use `wandb login --relogin` to force relogin)\n", - "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.12.16 is available! To upgrade, please run:\n", - "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " Syncing run run-42 to Weights & Biases (docs).
\n", - "\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "wandb.init(name='run-42', project='gpt2-test', config=config, )" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -213,21 +111,18 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Load IMDB dataset\n", - "The IMDB dataset contains 50k movie review annotated with \"positive\"/\"negative\" feedback indicating the sentiment. We load the IMDB dataset into a DataFrame and filter for comments that are at least 500 characters long and take the first 1000 characters of each comment. The first filter we apply to avoid comments that are less than `txt_in_len` token long and the second to avoid tokenizing way more text than we actually need." + "The IMDB dataset contains 50k movie review annotated with \"positive\"/\"negative\" feedback indicating the sentiment. We load the IMDB dataset into a DataFrame and filter for comments that are at least 200 characters. Then we tokenize each text and cut it to random size with the `LengthSampler`." ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -239,184 +134,132 @@ } ], "source": [ - "# load imdb with datasets\n", - "ds = load_dataset('imdb', split='train')\n", - "ds = ds.rename_columns({'text': 'review', 'label': 'sentiment'})\n", - "ds = ds.filter(lambda x: len(x[\"review\"])>200, batched=False)" + "def build_dataset(config, dataset_name=\"imdb\", input_min_text_length=2, input_max_text_length=8):\n", + " \"\"\"\n", + " Build dataset for training. This builds the dataset from `load_dataset`, one should \n", + " customize this function to train the model on its own dataset.\n", + " \n", + " Args:\n", + " dataset_name (`str`): \n", + " The name of the dataset to be loaded.\n", + " \n", + " Returns:\n", + " dataloader (`torch.utils.data.DataLoader`):\n", + " The dataloader for the dataset.\n", + " \"\"\"\n", + " tokenizer = AutoTokenizer.from_pretrained(config.model_name)\n", + " tokenizer.pad_token = tokenizer.eos_token\n", + " # load imdb with datasets\n", + " ds = load_dataset(dataset_name, split='train')\n", + " ds = ds.rename_columns({'text': 'review'})\n", + " ds = ds.filter(lambda x: len(x[\"review\"])>200, batched=False)\n", + "\n", + " input_size = LengthSampler(input_min_text_length, input_max_text_length)\n", + "\n", + " def tokenize(sample):\n", + " sample[\"input_ids\"] = tokenizer.encode(sample[\"review\"])[:input_size()]\n", + " sample[\"query\"] = tokenizer.decode(sample[\"input_ids\"])\n", + " return sample\n", + "\n", + " ds = ds.map(tokenize, batched=False)\n", + " ds.set_format(type='torch')\n", + " return ds" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "Dataset({\n", - " features: ['review', 'sentiment'],\n", - " num_rows: 24895\n", - "})" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ds" - ] - }, - { - "cell_type": "markdown", "metadata": {}, - "source": [ - "### Load BERT classifier\n", - "We load a BERT classifier fine-tuned on the IMDB dataset." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, "outputs": [], "source": [ - "sent_kwargs = {\n", - " \"return_all_scores\": True,\n", - " \"function_to_apply\": \"none\",\n", - " \"batch_size\": config[\"forward_batch_size\"]\n", - "}\n", + "dataset = build_dataset(config)\n", "\n", - "sentiment_pipe = pipeline(\"sentiment-analysis\",\"lvwerra/distilbert-imdb\", device=pipe_device)" + "def collator(data):\n", + " return dict((key, [d[key] for d in data]) for key in data[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The model outputs are the logits for the negative and positive class. We will use the logits for positive class as a reward signal for the language model." + "### Load pre-trained GPT2 language models" ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[[{'label': 'NEGATIVE', 'score': 2.335048198699951},\n", - " {'label': 'POSITIVE', 'score': -2.726576566696167}]]" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "cell_type": "markdown", + "metadata": {}, "source": [ - "text = 'this movie was really bad!!'\n", - "sentiment_pipe(text, **sent_kwargs)" + "We load the GPT2 model with a value head and the tokenizer. We load the model twice; the first model is optimized while the second model serves as a reference to calculate the KL-divergence from the starting point. This serves as an additional reward signal in the PPO training to make sure the optimized model does not deviate too much from the original language model." ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[[{'label': 'NEGATIVE', 'score': -2.2947897911071777},\n", - " {'label': 'POSITIVE', 'score': 2.557039737701416}]]" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "metadata": {}, + "outputs": [], "source": [ - "text = 'this movie was really good!!'\n", - "sentiment_pipe(text, **sent_kwargs)" + "model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)\n", + "ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)\n", + "tokenizer = AutoTokenizer.from_pretrained(config.model_name)\n", + "\n", + "tokenizer.pad_token = tokenizer.eos_token" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "The resulting reward signal:" + "### Initialize PPOTrainer\n", + "The `PPOTrainer` takes care of device placement and optimization later on:" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "### Load pre-trained GPT2 language models" + "ppo_trainer = PPOTrainer(config, model, ref_model, tokenizer, dataset=dataset, data_collator=collator)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We load the GPT2 model with a value head and the tokenizer. We load the model twice; the first model is optimized while the second model serves as a reference to calculate the KL-divergence from the starting point. This serves as an additional reward signal in the PPO training to make sure the optimized model does not deviate too much from the original language model." 
+ "### Load BERT classifier\n", + "We load a BERT classifier fine-tuned on the IMDB dataset." ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ - "gpt2_model = GPT2HeadWithValueModel.from_pretrained(config['model_name'])\n", - "gpt2_model_ref = GPT2HeadWithValueModel.from_pretrained(config['model_name'])\n", - "\n", - "gpt2_tokenizer = AutoTokenizer.from_pretrained(config['model_name'])\n", - "gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token" + "device = ppo_trainer.accelerator.device\n", + "if ppo_trainer.accelerator.num_processes == 1:\n", + " device = 0 if torch.cuda.is_available() else \"cpu\" # to avoid a `pipeline` bug\n", + "sentiment_pipe = pipeline(\"sentiment-analysis\", model=\"lvwerra/distilbert-imdb\", device=device)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Watch model with wandb\n", - "This wandb magic logs the gradients and weights of the model during training." + "The model outputs are the logits for the negative and positive class. We will use the logits for positive class as a reward signal for the language model." ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[]" + "[[{'label': 'NEGATIVE', 'score': 2.335048198699951},\n", + " {'label': 'POSITIVE', 'score': -2.726576566696167}]]" ] }, "execution_count": null, @@ -425,94 +268,30 @@ } ], "source": [ - "wandb.watch(gpt2_model, log='all')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Move models to GPU" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If `cuda` is available move the computations to the GPU." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, - "outputs": [], - "source": [ - "gpt2_model.to(device);\n", - "gpt2_model_ref.to(device);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Tokenize IMDB reviews" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We want to randomize the query and response length so we introduce a `LengthSampler` that uniformly samples values from an interval." + "text = 'this movie was really bad!!'\n", + "sentiment_pipe(text, **sent_kwargs)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, - "outputs": [], - "source": [ - "class LengthSampler:\n", - " def __init__(self, min_value, max_value):\n", - " self.values = list(range(min_value, max_value))\n", - " def __call__(self):\n", - " return np.random.choice(self.values)\n", - " \n", - "input_size = LengthSampler(config[\"txt_in_min_len\"], config[\"txt_in_max_len\"])\n", - "output_size = LengthSampler(config[\"txt_out_min_len\"], config[\"txt_out_max_len\"])" - ] - }, - { - "cell_type": "markdown", "metadata": {}, - "source": [ - "We pre-tokenize all IMDB in advance to avoid tokenizing twice. In the first step we encode the queries and slice the first `input_size()` tokens. In a second step we decode these tokens back to text for later display." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" + "outputs": [ + { + "data": { + "text/plain": [ + "[[{'label': 'NEGATIVE', 'score': -2.2947897911071777},\n", + " {'label': 'POSITIVE', 'score': 2.557039737701416}]]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } - }, - "outputs": [], + ], "source": [ - "def tokenize(sample):\n", - " sample[\"tokens\"] = gpt2_tokenizer.encode(sample[\"review\"])[:input_size()]\n", - " sample[\"query\"] = gpt2_tokenizer.decode(sample[\"tokens\"])\n", - " return sample\n", - "\n", - "ds = ds.map(tokenize, batched=False)" + "text = 'this movie was really good!!'\n", + "sentiment_pipe(text, **sent_kwargs)" ] }, { @@ -526,11 +305,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "gen_kwargs = {\n", @@ -538,7 +313,7 @@ " \"top_k\": 0.0,\n", " \"top_p\": 1.0,\n", " \"do_sample\": True,\n", - " \"pad_token_id\": gpt2_tokenizer.eos_token_id\n", + " \"pad_token_id\": tokenizer.eos_token_id\n", "}" ] }, @@ -549,30 +324,6 @@ "## Optimize model" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Dataloader\n", - "We use a dataloader to return the batches of queries used for each PPO epoch:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, - "outputs": [], - "source": [ - "def collator(data):\n", - " return dict((key, [d[key] for d in data]) for key in data[0])\n", - "\n", - "dataloader = torch.utils.data.DataLoader(ds, batch_size=config['batch_size'], collate_fn=collator)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -597,55 +348,43 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ - "ppo_trainer = PPOTrainer(gpt2_model, gpt2_model_ref, gpt2_tokenizer, **config)\n", + "output_min_length = 4\n", + "output_max_length = 16\n", + "output_length_sampler = LengthSampler(output_min_length, output_max_length)\n", "\n", - "total_ppo_epochs = int(np.ceil(config[\"steps\"]/config['batch_size']))\n", "\n", - "for epoch, batch in tqdm(zip(range(total_ppo_epochs), iter(dataloader))):\n", - " logs, timing = dict(), dict()\n", - " t0 = time.time()\n", - " query_tensors = [torch.tensor(t).long().to(device) for t in batch[\"tokens\"]]\n", - " \n", + "generation_kwargs = {\n", + " \"min_length\":-1,\n", + " \"top_k\": 0.0,\n", + " \"top_p\": 1.0,\n", + " \"do_sample\": True,\n", + " \"pad_token_id\": tokenizer.eos_token_id\n", + "}\n", + "\n", + "\n", + "for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):\n", + " query_tensors = batch['input_ids']\n", + "\n", " #### Get response from gpt2\n", - " t = time.time()\n", " response_tensors = []\n", - " for i in range(config['batch_size']):\n", - " gen_len = output_size()\n", - " response = gpt2_model.generate(query_tensors[i].unsqueeze(dim=0),\n", - " max_new_tokens=gen_len, **gen_kwargs)\n", + " for query in query_tensors:\n", + " gen_len = output_length_sampler()\n", + " generation_kwargs[\"max_new_tokens\"] = gen_len\n", + " response = ppo_trainer.generate(query, **generation_kwargs)\n", " response_tensors.append(response.squeeze()[-gen_len:])\n", - " batch['response'] = [gpt2_tokenizer.decode(r.squeeze()) for r in response_tensors]\n", - " timing['time/get_response'] = time.time()-t\n", + " 
batch['response'] = [tokenizer.decode(r.squeeze()) for r in response_tensors]\n", "\n", " #### Compute sentiment score\n", - " t = time.time()\n", " texts = [q + r for q,r in zip(batch['query'], batch['response'])]\n", " pipe_outputs = sentiment_pipe(texts, **sent_kwargs)\n", - " rewards = torch.tensor([output[1][\"score\"] for output in pipe_outputs]).to(device)\n", - " timing['time/get_sentiment_preds'] = time.time()-t\n", - " \n", + " rewards = [torch.tensor(output[1][\"score\"]) for output in pipe_outputs]\n", + "\n", " #### Run PPO step \n", - " t = time.time()\n", " stats = ppo_trainer.step(query_tensors, response_tensors, rewards)\n", - " timing['time/optimization'] = time.time()-t\n", - " \n", - " #### Log everything\n", - " timing['time/epoch'] = time.time()-t0\n", - " table_rows = [list(r) for r in zip(batch['query'], batch['response'], rewards.cpu().tolist())]\n", - " logs.update({'game_log': wandb.Table(columns=['query', 'response', 'reward'], rows=table_rows)})\n", - " logs.update(timing)\n", - " logs.update(stats)\n", - " logs['env/reward_mean'] = torch.mean(rewards).cpu().numpy()\n", - " logs['env/reward_std'] = torch.std(rewards).cpu().numpy()\n", - " logs['env/reward_dist'] = rewards.cpu().numpy()\n", - " wandb.log(logs)" + " ppo_trainer.log_stats(stats, batch, rewards)" ] }, { @@ -666,21 +405,18 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Model inspection\n", - "Let's inspect some examples from the IMDB dataset. We can use `gpt2_model_ref` to compare the tuned model `gpt2_model` against the model before optimisation." + "Let's inspect some examples from the IMDB dataset. We can use `model_ref` to compare the tuned model `model` against the model before optimisation." ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -934,8 +670,8 @@ "#### get a batch from the dataset\n", "bs = 16\n", "game_data = dict()\n", - "ds.set_format(\"pandas\")\n", - "df_batch = ds[:].sample(bs)\n", + "dataset.set_format(\"pandas\")\n", + "df_batch = dataset[:].sample(bs)\n", "game_data['query'] = df_batch['query'].tolist()\n", "query_tensors = df_batch['tokens'].tolist()\n", "\n", @@ -943,17 +679,17 @@ "\n", "#### get response from gpt2 and gpt2_ref\n", "for i in range(bs):\n", - " gen_len = output_size()\n", - " output = gpt2_model_ref.generate(torch.tensor(query_tensors[i]).unsqueeze(dim=0).to(device),\n", + " gen_len = output_length_sampler()\n", + " output = ref_model.generate(torch.tensor(query_tensors[i]).unsqueeze(dim=0).to(device),\n", " max_new_tokens=gen_len, **gen_kwargs).squeeze()[-gen_len:]\n", " response_tensors_ref.append(output)\n", - " output = gpt2_model.generate(torch.tensor(query_tensors[i]).unsqueeze(dim=0).to(device),\n", + " output = model.generate(torch.tensor(query_tensors[i]).unsqueeze(dim=0).to(device),\n", " max_new_tokens=gen_len, **gen_kwargs).squeeze()[-gen_len:]\n", " response_tensors.append(output)\n", "\n", "#### decode responses\n", - "game_data['response (before)'] = [gpt2_tokenizer.decode(response_tensors_ref[i]) for i in range(bs)]\n", - "game_data['response (after)'] = [gpt2_tokenizer.decode(response_tensors[i]) for i in range(bs)]\n", + "game_data['response (before)'] = [tokenizer.decode(response_tensors_ref[i]) for i in range(bs)]\n", + "game_data['response (after)'] = [tokenizer.decode(response_tensors[i]) for i in range(bs)]\n", "\n", "#### sentiment analysis of query/response pairs 
before/after\n", "texts = [q + r for q,r in zip(game_data['query'], game_data['response (before)'])]\n", @@ -977,11 +713,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1040,11 +772,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -1102,25 +830,21 @@ } ], "source": [ - "gpt2_model.save_pretrained('gpt2-imdb-pos-v2', push_to_hub=True)\n", - "gpt2_tokenizer.save_pretrained('gpt2-imdb-pos-v2', push_to_hub=True)" + "model.save_pretrained('gpt2-imdb-pos-v2', push_to_hub=True)\n", + "tokenizer.save_pretrained('gpt2-imdb-pos-v2', push_to_hub=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "env", "language": "python", "name": "python3" }, @@ -1134,7 +858,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12" + "version": "3.9.12 (main, Mar 26 2022, 15:51:15) \n[Clang 13.1.6 (clang-1316.0.21.2)]" + }, + "vscode": { + "interpreter": { + "hash": "4c8ff454cd947027f86954d72bf940c689a97dcc494eb53cfe4813862c6065fe" + } } }, "nbformat": 4, diff --git a/examples/scripts/ppo-sentiment.py b/examples/sentiment/scripts/gpt2-sentiment.py similarity index 81% rename from examples/scripts/ppo-sentiment.py rename to examples/sentiment/scripts/gpt2-sentiment.py index b5a3560270..7fc859c1fb 100644 --- a/examples/scripts/ppo-sentiment.py +++ b/examples/sentiment/scripts/gpt2-sentiment.py @@ -59,36 +59,7 @@ # Below is an example function to build the dataset. In our case, we use the IMDB dataset # from the `datasets` library. One should customize this function to train the model on # its own dataset. -def build_dataset(config, dataset_name="imdb", input_min_text_length=2, input_max_text_length=8): - """ - Build dataset for training. This builds the dataset from `load_dataset`, one should - customize this function to train the model on its own dataset. - - Args: - dataset_name (`str`): - The name of the dataset to be loaded. - - Returns: - dataloader (`torch.utils.data.DataLoader`): - The dataloader for the dataset. - """ - tokenizer = AutoTokenizer.from_pretrained(config.model_name) - tokenizer.pad_token = tokenizer.eos_token - # load imdb with datasets - ds = load_dataset(dataset_name, split='train') - ds = ds.rename_columns({'text': 'review'}) - ds = ds.filter(lambda x: len(x["review"])>200, batched=False) - - input_size = LengthSampler(input_min_text_length, input_max_text_length) - - def tokenize(sample): - sample["input_ids"] = tokenizer.encode(sample["review"])[:input_size()] - sample["query"] = tokenizer.decode(sample["input_ids"]) - return sample - - ds = ds.map(tokenize, batched=False) - ds.set_format(type='torch') - return ds + # We retrieve the dataloader by calling the `build_dataset` function. dataset = build_dataset(config) diff --git a/examples/scripts/ppo-sentiment-t5-small.py b/examples/sentiment/scripts/t5-sentiment.py similarity index 100% rename from examples/scripts/ppo-sentiment-t5-small.py rename to examples/sentiment/scripts/t5-sentiment.py