Skip to content

Commit

Permalink
langchain[minor]: Add document loader for ChatGPT data (#3439)
Browse files Browse the repository at this point in the history
* copy pasting and basic error fixing

* fixed testing issue

- fixed issue
- fixed code
- added blob tests
- fixed test timestamps

* Update chatgpt.mdx

* Update chatgpt.mdx

* Update chatgpt.ts

* Throws errors also package.json and .gitignore update

- console error logging also throws error instead of not doing that
- put chatgpt.ts related files into package.json and .gitignore
- ran `yarn lint` and `yarn format` many times to be sure

* Format

* whoops one more

* Fix test

---------

Co-authored-by: jacoblee93 <[email protected]>
  • Loading branch information
Zeneos and jacoblee93 authored Nov 30, 2023
1 parent 441adc2 commit c1c988c
Show file tree
Hide file tree
Showing 7 changed files with 579 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# ChatGPT files

This example shows how to load `conversations.json` from your ChatGPT data export folder. To request a data export by email, go to: ChatGPT -> (Profile) -> Settings -> Export data -> Confirm export, then check your email.

## Usage, extracting all logs

Example code:

```typescript
import { ChatGPTLoader } from "langchain/document_loaders/fs/chatgpt";

const loader = new ChatGPTLoader("./example_data/example_conversations.json");

const docs = await loader.load();

console.log(docs);
```

## Usage, extracting a single log

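Example code (the optional second argument limits how many conversation logs are loaded, counting from the start of the file; here only the first one is returned):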

```typescript
import { ChatGPTLoader } from "langchain/document_loaders/fs/chatgpt";

const loader = new ChatGPTLoader(
"./example_data/example_conversations.json",
1
);

const docs = await loader.load();

console.log(docs);
```
3 changes: 3 additions & 0 deletions langchain/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,9 @@ document_loaders/fs/openai_whisper_audio.d.ts
document_loaders/fs/pptx.cjs
document_loaders/fs/pptx.js
document_loaders/fs/pptx.d.ts
document_loaders/fs/chatgpt.cjs
document_loaders/fs/chatgpt.js
document_loaders/fs/chatgpt.d.ts
document_transformers/html_to_text.cjs
document_transformers/html_to_text.js
document_transformers/html_to_text.d.ts
Expand Down
3 changes: 3 additions & 0 deletions langchain/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -484,6 +484,9 @@
"document_loaders/fs/pptx.cjs",
"document_loaders/fs/pptx.js",
"document_loaders/fs/pptx.d.ts",
"document_loaders/fs/chatgpt.cjs",
"document_loaders/fs/chatgpt.js",
"document_loaders/fs/chatgpt.d.ts",
"document_transformers/html_to_text.cjs",
"document_transformers/html_to_text.js",
"document_transformers/html_to_text.d.ts",
Expand Down
103 changes: 103 additions & 0 deletions langchain/src/document_loaders/fs/chatgpt.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import { TextLoader } from "./text.js";
import { Document } from "../../document.js";

/**
 * A single message as it appears in a ChatGPT `conversations.json` export.
 * Only the fields read by this loader are declared here.
 */
interface ChatGPTMessage {
  // Author of the message. `role` is printed in every formatted row and is
  // compared against "system" when the loader decides whether to drop the
  // first message of a conversation.
  author: {
    role: string;
  };
  // Message body. concatenateRows reads only `parts[0]`.
  content: {
    parts: string[];
  };
  // Multiplied by 1000 before being handed to `new Date(...)`, i.e. it is
  // treated as seconds since the Unix epoch.
  create_time: number;
}

/**
 * One conversation entry of the parsed export. The loader turns each entry
 * into a single string (and then a single Document).
 */
interface ChatGPTLog {
  // Conversation title; used as the prefix of every formatted row.
  title: string;
  // Message nodes of the conversation. The loader iterates them with
  // Object.values, so the keys themselves are never read.
  // NOTE(review): concatenateRows guards against a falsy `message`, which
  // suggests nodes without a message can occur in real exports — confirm and
  // consider widening this type to `ChatGPTMessage | null`.
  mapping: Record<string, { message: ChatGPTMessage }>;
}

/**
 * Format a single ChatGPT message as one human-readable row.
 *
 * The row has the shape
 * `<title> - <role> on <YYYY-MM-DD HH:MM:SS>: <text>\n\n`,
 * with the timestamp rendered in UTC. Only the first entry of
 * `content.parts` is used as the text.
 *
 * Malformed messages never throw: a missing role becomes "unknown", a missing
 * or non-string first part becomes an empty text, and a timestamp that cannot
 * be converted to a valid date becomes "unknown date".
 *
 * @param message - Message to format. A missing (null/undefined) message
 *   yields an empty string.
 * @param title - Title of the conversation the message belongs to.
 * @returns The formatted row, or "" when there is no message.
 */
function concatenateRows(message: ChatGPTMessage, title: string): string {
  if (!message) {
    return "";
  }

  const sender = message.author?.role ?? "unknown";

  // Exports may contain messages without `content.parts`; indexing
  // them directly would throw a TypeError.
  const firstPart: unknown = message.content?.parts?.[0];
  const text = typeof firstPart === "string" ? firstPart : "";

  // `create_time` is in seconds; Date expects milliseconds. An invalid Date
  // would make toISOString() throw a RangeError, so check it first.
  const createdAt = new Date(message.create_time * 1000);
  const date = Number.isNaN(createdAt.getTime())
    ? "unknown date"
    : createdAt.toISOString().slice(0, 19).replace("T", " ");

  return `${title} - ${sender} on ${date}: ${text}\n\n`;
}

/**
 * Document loader for a ChatGPT data export (`conversations.json`).
 *
 * Every conversation in the export becomes one Document whose `pageContent`
 * is the concatenation of its formatted messages and whose metadata carries
 * the source plus a 1-based `logIndex`.
 */
export class ChatGPTLoader extends TextLoader {
  /**
   * Maximum number of conversations to load, counted from the start of the
   * export. Any value that is not greater than 0 means "load everything".
   */
  public numLogs: number;

  /**
   * @param filePathOrBlob - Path to the export file, or a Blob holding its
   *   contents.
   * @param numLogs - Optional limit on the number of conversations to load;
   *   defaults to 0 (no limit).
   */
  constructor(filePathOrBlob: string | Blob, numLogs = 0) {
    super(filePathOrBlob);
    this.numLogs = numLogs;
  }

  /**
   * Parse the raw export text into one string per conversation.
   *
   * @param raw - JSON text of the export; must be an array of conversations.
   * @returns One concatenated, formatted string per (kept) conversation.
   * @throws Error if `raw` is not valid JSON or is not a JSON array.
   */
  protected async parse(raw: string): Promise<string[]> {
    let data: unknown;
    try {
      data = JSON.parse(raw);
    } catch (e) {
      console.error(e);
      throw new Error("Failed to parse JSON");
    }

    // Valid JSON that is not an array (e.g. a single object) used to fail
    // later with an opaque "data.slice is not a function".
    if (!Array.isArray(data)) {
      throw new Error(
        "Failed to parse JSON: expected an array of conversations"
      );
    }

    const logs: ChatGPTLog[] =
      this.numLogs > 0 ? data.slice(0, this.numLogs) : data;

    return logs.map((log) => {
      const nodes = log.mapping ? Object.values(log.mapping) : [];
      return (
        nodes
          // Drop a leading system message. The first node may carry no
          // message at all, so the role is read defensively; such nodes are
          // kept here and rendered as "" by concatenateRows.
          .filter(
            (node, idx) =>
              !(idx === 0 && node.message?.author?.role === "system")
          )
          .map((node) => concatenateRows(node.message, log.title))
          .join("")
      );
    });
  }

  /**
   * Read the configured file or blob and convert it into Documents.
   *
   * @returns One Document per conversation. Metadata contains `source` (the
   *   file path, or "blob"), `blobType` for blobs, and a 1-based `logIndex`.
   * @throws Error if the file/blob cannot be read or its content cannot be
   *   parsed.
   */
  public async load(): Promise<Document[]> {
    let text: string;
    let metadata: Record<string, string>;
    if (typeof this.filePathOrBlob === "string") {
      const { readFile } = await TextLoader.imports();
      try {
        text = await readFile(this.filePathOrBlob, "utf8");
      } catch (e) {
        console.error(e);
        throw new Error("Failed to read file");
      }
      metadata = { source: this.filePathOrBlob };
    } else {
      try {
        text = await this.filePathOrBlob.text();
      } catch (e) {
        console.error(e);
        throw new Error("Failed to read blob");
      }
      metadata = { source: "blob", blobType: this.filePathOrBlob.type };
    }

    const parsed = await this.parse(text);
    return parsed.map(
      (pageContent, i) =>
        new Document({
          pageContent,
          metadata: {
            ...metadata,
            // Human-friendly position of the conversation, starting at 1.
            logIndex: i + 1,
          },
        })
    );
  }
}
52 changes: 52 additions & 0 deletions langchain/src/document_loaders/tests/chatgpt-blob.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import * as url from "node:url";
import * as path from "node:path";
import * as fs from "node:fs/promises";
import { test, expect } from "@jest/globals";
import { Document } from "../../document.js";
import { ChatGPTLoader } from "../fs/chatgpt.js";

// MIME type given to the Blob and expected back in the document metadata.
const BLOB_MIME_TYPE = "application/json";

// Expected page content for the first conversation of the sample export.
const FIRST_BLOB_LOG_TEXT =
  "Example Usage - user on 2023-10-16 23:40:17: Hello, what is your name?\n\nExample Usage - assistant on 2023-10-16 23:40:23: Hello! I'm just a computer program created by OpenAI, so I don't have a personal name. You can call me ChatGPT or simply ask me any questions or chat about topics you're interested in. How can I assist you today?\n\n";

// Expected page content for the second conversation of the sample export.
const SECOND_BLOB_LOG_TEXT =
  "Example Usage 2 - user on 2023-10-13 23:02:19: What should I do today?\n\nExample Usage 2 - assistant on 2023-10-13 23:02:27: You should contribute to LangChain!\n\nExample Usage 2 - user on 2023-10-13 23:03:30: How can I start?\n\nExample Usage 2 - assistant on 2023-10-13 23:03:38: You can take a look at the current LangChain issues and see if you can contribute to any! Don't forget to read the contributing.md file.\n\nExample Usage 2 - user on 2023-10-13 23:09:24: Thank you!\n\nExample Usage 2 - assistant on 2023-10-13 23:09:34: You're welcome! If you have any more questions or need further assistance in the future, feel free to reach out.\n\n";

/** Reads the sample export next to this test file and wraps it in a Blob. */
async function readExampleBlob(): Promise<Blob> {
  const testDir = path.dirname(url.fileURLToPath(import.meta.url));
  const fixturePath = path.resolve(
    testDir,
    "./example_data/chatgpt/example_conversations.json"
  );
  const bytes = await fs.readFile(fixturePath);
  return new Blob([bytes], { type: BLOB_MIME_TYPE });
}

/** Builds the Document expected for the conversation at 1-based `logIndex`. */
function expectedBlobDocument(logIndex: number, pageContent: string): Document {
  return new Document({
    pageContent,
    metadata: { source: "blob", blobType: BLOB_MIME_TYPE, logIndex },
  });
}

test("Test ChatGPT loader from blob to load all documents", async () => {
  const blob = await readExampleBlob();
  const loaded = await new ChatGPTLoader(blob).load();

  expect(loaded).toHaveLength(2);
  const [first, second] = loaded;
  expect(first).toEqual(expectedBlobDocument(1, FIRST_BLOB_LOG_TEXT));
  expect(second).toEqual(expectedBlobDocument(2, SECOND_BLOB_LOG_TEXT));
});

test("Test ChatGPT loader from blob to only load 1 document", async () => {
  const blob = await readExampleBlob();
  const loaded = await new ChatGPTLoader(blob, 1).load();

  expect(loaded).toHaveLength(1);
  expect(loaded[0]).toEqual(expectedBlobDocument(1, FIRST_BLOB_LOG_TEXT));
});
46 changes: 46 additions & 0 deletions langchain/src/document_loaders/tests/chatgpt.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import * as url from "node:url";
import * as path from "node:path";
import { test, expect } from "@jest/globals";
import { Document } from "../../document.js";
import { ChatGPTLoader } from "../fs/chatgpt.js";

// Expected page content for the first conversation of the sample export.
const FIRST_LOG_TEXT =
  "Example Usage - user on 2023-10-16 23:40:17: Hello, what is your name?\n\nExample Usage - assistant on 2023-10-16 23:40:23: Hello! I'm just a computer program created by OpenAI, so I don't have a personal name. You can call me ChatGPT or simply ask me any questions or chat about topics you're interested in. How can I assist you today?\n\n";

// Expected page content for the second conversation of the sample export.
const SECOND_LOG_TEXT =
  "Example Usage 2 - user on 2023-10-13 23:02:19: What should I do today?\n\nExample Usage 2 - assistant on 2023-10-13 23:02:27: You should contribute to LangChain!\n\nExample Usage 2 - user on 2023-10-13 23:03:30: How can I start?\n\nExample Usage 2 - assistant on 2023-10-13 23:03:38: You can take a look at the current LangChain issues and see if you can contribute to any! Don't forget to read the contributing.md file.\n\nExample Usage 2 - user on 2023-10-13 23:09:24: Thank you!\n\nExample Usage 2 - assistant on 2023-10-13 23:09:34: You're welcome! If you have any more questions or need further assistance in the future, feel free to reach out.\n\n";

/** Absolute path of the sample export that sits next to this test file. */
function exampleConversationsPath(): string {
  const testDir = path.dirname(url.fileURLToPath(import.meta.url));
  return path.resolve(
    testDir,
    "./example_data/chatgpt/example_conversations.json"
  );
}

/** Builds the Document expected for the conversation at 1-based `logIndex`. */
function expectedFileDocument(
  source: string,
  logIndex: number,
  pageContent: string
): Document {
  return new Document({ pageContent, metadata: { source, logIndex } });
}

test("Test ChatGPT loader to load all documents", async () => {
  const source = exampleConversationsPath();
  const loaded = await new ChatGPTLoader(source).load();

  expect(loaded).toHaveLength(2);
  const [first, second] = loaded;
  expect(first).toEqual(expectedFileDocument(source, 1, FIRST_LOG_TEXT));
  expect(second).toEqual(expectedFileDocument(source, 2, SECOND_LOG_TEXT));
});

test("Test ChatGPT loader to only load 1 document", async () => {
  const source = exampleConversationsPath();
  const loaded = await new ChatGPTLoader(source, 1).load();

  expect(loaded).toHaveLength(1);
  expect(loaded[0]).toEqual(expectedFileDocument(source, 1, FIRST_LOG_TEXT));
});
Loading

1 comment on commit c1c988c

@vercel
Copy link

@vercel vercel bot commented on c1c988c Nov 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.