Chain of density #135

Merged (16 commits) on Nov 12, 2023
417 changes: 417 additions & 0 deletions docs/blog/posts/chain-of-density.md

Large diffs are not rendered by default.

Binary file added docs/blog/posts/img/chain-of-density.png
31 changes: 31 additions & 0 deletions examples/chain-of-density/Readme.md
@@ -0,0 +1,31 @@
# Introduction

This is a simple example showing how to perform Chain of Density summarization with GPT-4 and use the generated output to fine-tune a GPT-3.5 model for production usage. All of the data referenced in this file is available [here](https://huggingface.co/datasets/ivanleomk/gpt4-chain-of-density) on Hugging Face.

Check out our blog post [here](https://jxnl.github.io/instructor/blog/2023/11/05/implementing-chain-of-density/) where we have a detailed explanation of the code and a [colab notebook](https://colab.research.google.com/drive/1iBkrEh2G5U8yh8RmI8EkWxjLq6zIIuVm?usp=sharing) walking you through how we perform our calculations.

## Instructions

1. First, install all of the required dependencies by running the command below. We recommend using a virtual environment so that these packages do not affect your system installation.

> We use NLTK to ensure that our summaries are of a certain token length. To do so, you'll need to download the `punkt` package, which is used to compute the token metrics. You can do so by running `nltk.download('punkt')`.

```
pip3 install -r requirements.txt
```

2. Download the `test.csv` file and the `summarization.jsonl` file that you want to use for fine-tuning. We provide versions with `20`, `50`, and `100` examples for testing. Let's now run a simple fine-tuning job with the following command.

> Don't forget to set your `OPENAI_API_KEY` as an environment variable in your shell before running these commands

```
instructor jobs create-from-file summarization.jsonl
```

> **Review comment (Contributor):** The command `instructor jobs create-from-file summarization.jsonl` seems to be incorrect. It should be `instruct` instead of `instructor`.
>
> ```diff
> - instructor jobs create-from-file summarization.jsonl
> + instruct jobs create-from-file summarization.jsonl
> ```

3. Once the job is complete, you'll end up with a new GPT-3.5 model that's capable of producing high-quality summaries with high entity density. You can use it by changing the `instructions.distil` decorator in our `finetune.py` file as follows:

```
@instructions.distil(model=<your finetuned model>, mode="dispatch")
def distil_summarization(text: str) -> GeneratedSummary:
    # rest of the code goes here
```
144 changes: 144 additions & 0 deletions examples/chain-of-density/chain_of_density.py
@@ -0,0 +1,144 @@
from pydantic import BaseModel, Field, field_validator
from typing import List
import instructor
import nltk
from openai import OpenAI

client = instructor.patch(OpenAI())


class InitialSummary(BaseModel):
    """
    This is an initial summary which should be long (4-5 sentences, ~80 words) yet highly non-specific, containing little information beyond the entities marked as missing. Use overly verbose language and fillers (e.g., "This article discusses") to reach ~80 words.
    """

    summary: str = Field(
        ...,
        description="This is a summary of the article provided which is overly verbose and uses fillers. It should be roughly 80 words in length",
    )


class RewrittenSummary(BaseModel):
    """
    This is a new, denser summary of identical length which covers every entity and detail from the previous summary plus the Missing Entities.

    Guidelines
    - Make every word count: rewrite the previous summary to improve flow and make space for additional entities
    - Never drop entities from the previous summary. If space cannot be made, add fewer new entities.
    - The new summary should be highly dense and concise yet self-contained, i.e., easily understood without the Article.
    - Make space with fusion, compression, and removal of uninformative phrases like "the article discusses"
    - Missing entities can appear anywhere in the new summary

    An Entity is a real-world object that is assigned a name - for example, a person, a country, a product, or a book title.
    """

    summary: str = Field(
        ...,
        description="This is a new, denser summary of identical length which covers every entity and detail from the previous summary plus the Missing Entities. It should have the same length (~80 words) as the previous summary and should be easily understood without the Article",
    )
    absent: List[str] = Field(
        default_factory=list,
        description="This is a list of Entities found absent from the new summary that were present in the previous summary",
    )
    missing: List[str] = Field(
        default_factory=list,
        description="This is a list of 1-3 informative Entities from the Article that are missing from the new summary which should be included in the next generated summary.",
    )

    @field_validator("summary")
    def min_length(cls, v: str):
        tokens = nltk.word_tokenize(v)
        num_tokens = len(tokens)
        if num_tokens < 75:
            raise ValueError(
                "The current summary is too short. Please make sure that you generate a new summary that is around 80 words long."
            )
        return v

    @field_validator("missing")
    def has_missing_entities(cls, missing_entities: List[str]):
        if len(missing_entities) == 0:
            raise ValueError(
                "You must identify 1-3 informative Entities from the Article which are missing from the previously generated summary to be used in a new summary"
            )
        return missing_entities

    @field_validator("absent")
    def has_no_absent_entities(cls, absent_entities: List[str]):
        absent_entity_string = ",".join(absent_entities)
        if len(absent_entities) > 0:
            print(f"Detected absent entities of {absent_entity_string}")
            raise ValueError(
                f"Do not omit the following Entities {absent_entity_string} from the new summary"
            )
        return absent_entities


def summarize_article(article: str, summary_steps: int = 3):
    summary_chain = []
    # We first generate an initial summary
    summary: InitialSummary = client.chat.completions.create(
        model="gpt-4-0613",
        response_model=InitialSummary,
        messages=[
            {
                "role": "system",
                "content": "Write a summary about the article that is long (4-5 sentences) yet highly non-specific. Use overly verbose language and fillers (e.g., 'this article discusses') to reach ~80 words",
            },
            {"role": "user", "content": f"Here is the Article: {article}"},
            {
                "role": "user",
                "content": "The generated summary should be about 80 words.",
            },
        ],
        max_retries=2,
    )
    prev_summary = None
    summary_chain.append(summary.summary)
    for i in range(summary_steps):
        missing_entity_message = (
            []
            if prev_summary is None
            else [
                {
                    "role": "user",
                    "content": f"Please include these Missing Entities: {','.join(prev_summary.missing)}",
                },
            ]
        )
        new_summary: RewrittenSummary = client.chat.completions.create(
            model="gpt-4-0613",
            messages=[
                {
                    "role": "system",
                    "content": """
                You are going to generate an increasingly concise, entity-dense summary of the following article.

                Perform the following two tasks
                - Identify 1-3 informative entities from the following article which are missing from the previous summary
                - Write a new, denser summary of identical length which covers every entity and detail from the previous summary plus the Missing Entities

                Guidelines
                - Make every word count: rewrite the previous summary to improve flow and make space for additional entities
                - Make space with fusion, compression, and removal of uninformative phrases like "the article discusses".
                - The summaries should become highly dense and concise yet self-contained, e.g., easily understood without the Article.
                - Missing entities can appear anywhere in the new summary
                - Never drop entities from the previous summary. If space cannot be made, add fewer new entities.
                """,
                },
                {"role": "user", "content": f"Here is the Article: {article}"},
                {
                    "role": "user",
                    "content": f"Here is the previous summary: {summary_chain[-1]}",
                },
                *missing_entity_message,
            ],
            max_retries=3,
            max_tokens=1000,
            response_model=RewrittenSummary,
        )
        summary_chain.append(new_summary.summary)
        prev_summary = new_summary

    return summary_chain
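The validator-driven retry pattern above can be exercised standalone. Here is a minimal sketch using a hypothetical `MiniSummary` model (a simplified stand-in: a plain whitespace split replaces NLTK tokenization, and the length threshold is lowered for illustration):

```python
from typing import List

from pydantic import BaseModel, Field, ValidationError, field_validator


class MiniSummary(BaseModel):
    """Simplified stand-in for RewrittenSummary, for illustration only."""

    summary: str
    absent: List[str] = Field(default_factory=list)
    missing: List[str] = Field(default_factory=list)

    @field_validator("summary")
    def min_length(cls, v: str):
        # Rough token count via whitespace split; the real code uses nltk.word_tokenize.
        if len(v.split()) < 5:
            raise ValueError("Summary too short, regenerate with ~80 words.")
        return v

    @field_validator("absent")
    def has_no_absent_entities(cls, v: List[str]):
        # Any dropped entity fails validation, prompting a retry upstream.
        if v:
            raise ValueError(f"Do not omit: {','.join(v)}")
        return v


# A compliant payload validates cleanly...
ok = MiniSummary(
    summary="Entity dense summary covering all prior entities", missing=["NASA"]
)

# ...while a payload that drops entities raises, which instructor's max_retries
# mechanism would feed back to the model as a correction prompt.
try:
    MiniSummary(
        summary="Entity dense summary covering all prior entities", absent=["NASA"]
    )
except ValidationError:
    print("validation failed as expected")
```

The design point is that the error messages double as instructions to the model on the retry, which is why they are written as imperatives rather than terse diagnostics.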
32 changes: 32 additions & 0 deletions examples/chain-of-density/finetune.py
@@ -0,0 +1,32 @@
from typing import List
from chain_of_density import summarize_article
import csv
import logging
import instructor
from pydantic import BaseModel

logging.basicConfig(level=logging.INFO)

instructions = instructor.Instructions(
    name="Chain Of Density",
    finetune_format="messages",
    log_handlers=[logging.FileHandler("summarization.jsonl")],
)
> **Review comment (Contributor):** The `Instructions` object is created but not used anywhere in the code. If it's not used, consider removing it to avoid confusion.
>
> ```diff
> - instructions = instructor.Instructions(
> -     name="Chain Of Density",
> -     finetune_format="messages",
> -     log_handlers=[logging.FileHandler("summarization.jsonl")],
> - )
> ```
class GeneratedSummary(BaseModel):
    summary: str


@instructions.distil
def distil_summarization(text: str) -> GeneratedSummary:
    summary_chain: List[str] = summarize_article(text)
    return GeneratedSummary(summary=summary_chain[-1])


# Read in the csv file we have
with open("test.csv", "r") as file:
    reader = csv.reader(file)
    next(reader)  # Skip the header
    for article, summary in reader:
        distil_summarization(article)
> **Review comment (Contributor):** The script reads from a CSV file and calls the `distil_summarization` function for each article. Ensure that the CSV file exists, is in the correct format, and that the file has read permissions. Also, the result of the `distil_summarization` function is not stored or used. If the result is needed, consider storing it in a variable or data structure.
>
> ```diff
> -    for article, summary in reader:
> -        distil_summarization(article)
> +    summaries = []
> +    for article, _ in reader:
> +        summaries.append(distil_summarization(article))
> ```
5 changes: 5 additions & 0 deletions examples/chain-of-density/requirements.txt
@@ -0,0 +1,5 @@
openai
pydantic
instructor
nltk
rich