-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
langchain[minor]: Add document loader for ChatGPT data (#3439)
* copy pasting and basic error fixing * fixed testing issue - fixed issue - fixed code - added blob tests - fixed test timestamps * Update chatgpt.mdx * Update chatgpt.mdx * Update chatgpt.ts * Throws errors also package.json and .gitignore update - console error logging also throws error instead of not doing that - put chatgpt.ts related files into package.json and .gitignore - ran `yarn lint` and `yarn format` many times to be sure * Format * whoops one more * Fix test --------- Co-authored-by: jacoblee93 <[email protected]>
- Loading branch information
1 parent
441adc2
commit c1c988c
Showing
7 changed files
with
579 additions
and
0 deletions.
There are no files selected for viewing
34 changes: 34 additions & 0 deletions
34
docs/core_docs/docs/integrations/document_loaders/file_loaders/chatgpt.mdx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# ChatGPT files | ||
|
||
This example goes over how to load `conversations.json` from your ChatGPT data export folder. You can request a data export, which is delivered by email, by going to: ChatGPT -> (Profile) -> Settings -> Export data -> Confirm export -> Check email. | ||
|
||
## Usage, extracting all logs | ||
|
||
Example code: | ||
|
||
```typescript | ||
import { ChatGPTLoader } from "langchain/document_loaders/fs/chatgpt"; | ||
|
||
const loader = new ChatGPTLoader("./example_data/example_conversations.json"); | ||
|
||
const docs = await loader.load(); | ||
|
||
console.log(docs); | ||
``` | ||
|
||
## Usage, extracting a single log | ||
|
||
Example code: | ||
|
||
```typescript | ||
import { ChatGPTLoader } from "langchain/document_loaders/fs/chatgpt"; | ||
|
||
const loader = new ChatGPTLoader( | ||
"./example_data/example_conversations.json", | ||
1 | ||
); | ||
|
||
const docs = await loader.load(); | ||
|
||
console.log(docs); | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
import { TextLoader } from "./text.js"; | ||
import { Document } from "../../document.js"; | ||
|
||
/**
 * A single message inside a conversation of the ChatGPT export, restricted
 * to the fields this loader reads.
 */
interface ChatGPTMessage {
  author: {
    role: string;
  };
  content: {
    parts: string[];
  };
  /** Creation time; multiplied by 1000 before being handed to `Date`. */
  create_time: number;
}

/**
 * One conversation of the export: a title plus a map of nodes, each of
 * which wraps a message.
 */
interface ChatGPTLog {
  title: string;
  mapping: Record<string, { message: ChatGPTMessage }>;
}

/**
 * Combine message information in a readable format ready to be used.
 *
 * The result has the shape `"<title> - <role> on <YYYY-MM-DD HH:mm:ss>: <text>\n\n"`,
 * with the timestamp rendered in UTC (it is cut out of `toISOString()`).
 *
 * The function is deliberately tolerant of incomplete messages, because the
 * input comes straight from `JSON.parse` and is not validated:
 * - a missing message yields an empty string;
 * - a missing author or role is reported as `"unknown"`;
 * - a missing or non-numeric `create_time` is reported as `"unknown date"`
 *   instead of throwing a `RangeError` from `toISOString()`;
 * - a missing or non-string first content part is rendered as empty text.
 *
 * @param message - Message to be concatenated
 * @param title - Title of the conversation
 * @returns Concatenated message
 */
function concatenateRows(
  message: ChatGPTMessage | null | undefined,
  title: string
): string {
  if (!message) {
    return "";
  }

  const sender = message.author?.role ?? "unknown";

  // Only the first part is used; anything that is not a plain string is
  // rendered as empty text rather than "[object Object]" or "undefined".
  const firstPart: unknown = message.content?.parts?.[0];
  const text = typeof firstPart === "string" ? firstPart : "";

  // `new Date(NaN).toISOString()` throws, so guard before formatting.
  const seconds: unknown = message.create_time;
  const date =
    typeof seconds === "number" && Number.isFinite(seconds)
      ? new Date(seconds * 1000).toISOString().slice(0, 19).replace("T", " ")
      : "unknown date";

  return `${title} - ${sender} on ${date}: ${text}\n\n`;
}
|
||
/**
 * Document loader for the `conversations.json` file of a ChatGPT data
 * export. Every conversation in the file becomes one `Document` whose
 * `pageContent` is the concatenation of its formatted messages.
 *
 * @example
 * ```typescript
 * const loader = new ChatGPTLoader("./example_conversations.json", 1);
 * const docs = await loader.load();
 * ```
 */
export class ChatGPTLoader extends TextLoader {
  /**
   * Maximum number of conversations to load, counted from the start of the
   * file. Any value that is not greater than 0 means "load everything".
   */
  public numLogs: number;

  /**
   * @param filePathOrBlob - Path to the export file, or a Blob holding its contents
   * @param numLogs - Number of conversations to load; defaults to 0 (all)
   */
  constructor(filePathOrBlob: string | Blob, numLogs = 0) {
    super(filePathOrBlob);
    this.numLogs = numLogs;
  }

  /**
   * Turn the raw JSON text into one string per conversation.
   *
   * @param raw - Text content of the export file
   * @returns One concatenated transcript per conversation, in file order
   * @throws Error if the text is not valid JSON or its root is not an array
   */
  protected async parse(raw: string): Promise<string[]> {
    let data: unknown;
    try {
      data = JSON.parse(raw);
    } catch (e) {
      console.error(e);
      throw new Error("Failed to parse JSON");
    }

    // Fail with a clear message instead of "data.slice is not a function".
    if (!Array.isArray(data)) {
      throw new Error(
        "Failed to parse JSON: expected an array of conversations"
      );
    }

    const logs: ChatGPTLog[] =
      this.numLogs > 0 ? data.slice(0, this.numLogs) : data;

    return logs.map((log) =>
      Object.values(log.mapping ?? {})
        .map((node) => node?.message)
        // Nodes without a message carry no text; dropping them here also
        // keeps the role check below from dereferencing a null message.
        .filter((message): message is ChatGPTMessage => Boolean(message))
        // Skip a leading system message so the transcript starts with the
        // actual exchange.
        .filter(
          (message, idx) => !(idx === 0 && message.author?.role === "system")
        )
        .map((message) => concatenateRows(message, log.title))
        .join("")
    );
  }

  /**
   * Read the configured file or blob and build one `Document` per
   * conversation. Each document's metadata carries the source (the file
   * path, or `"blob"` plus the blob's MIME type) and a 1-based `logIndex`.
   *
   * @returns The loaded documents, in file order
   * @throws Error if the source cannot be read or parsed
   */
  public async load(): Promise<Document[]> {
    let text: string;
    let metadata: Record<string, string>;
    if (typeof this.filePathOrBlob === "string") {
      const { readFile } = await TextLoader.imports();
      try {
        text = await readFile(this.filePathOrBlob, "utf8");
      } catch (e) {
        console.error(e);
        throw new Error("Failed to read file");
      }
      metadata = { source: this.filePathOrBlob };
    } else {
      try {
        text = await this.filePathOrBlob.text();
      } catch (e) {
        console.error(e);
        throw new Error("Failed to read blob");
      }
      metadata = { source: "blob", blobType: this.filePathOrBlob.type };
    }

    const parsed = await this.parse(text);
    return parsed.map(
      (pageContent, i) =>
        new Document({
          pageContent,
          metadata: {
            ...metadata,
            // 1-based position of the conversation among the loaded ones.
            logIndex: i + 1,
          },
        })
    );
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
import * as url from "node:url"; | ||
import * as path from "node:path"; | ||
import * as fs from "node:fs/promises"; | ||
import { test, expect } from "@jest/globals"; | ||
import { Document } from "../../document.js"; | ||
import { ChatGPTLoader } from "../fs/chatgpt.js"; | ||
|
||
// Expected transcript of the first conversation in the example export.
const blobFirstLogText =
  "Example Usage - user on 2023-10-16 23:40:17: Hello, what is your name?\n\nExample Usage - assistant on 2023-10-16 23:40:23: Hello! I'm just a computer program created by OpenAI, so I don't have a personal name. You can call me ChatGPT or simply ask me any questions or chat about topics you're interested in. How can I assist you today?\n\n";

// Expected transcript of the second conversation in the example export.
const blobSecondLogText =
  "Example Usage 2 - user on 2023-10-13 23:02:19: What should I do today?\n\nExample Usage 2 - assistant on 2023-10-13 23:02:27: You should contribute to LangChain!\n\nExample Usage 2 - user on 2023-10-13 23:03:30: How can I start?\n\nExample Usage 2 - assistant on 2023-10-13 23:03:38: You can take a look at the current LangChain issues and see if you can contribute to any! Don't forget to read the contributing.md file.\n\nExample Usage 2 - user on 2023-10-13 23:09:24: Thank you!\n\nExample Usage 2 - assistant on 2023-10-13 23:09:34: You're welcome! If you have any more questions or need further assistance in the future, feel free to reach out.\n\n";

/** Read the example export next to this test file and wrap it in a JSON-typed Blob. */
async function readExampleExportAsBlob(): Promise<Blob> {
  const testDir = path.dirname(url.fileURLToPath(import.meta.url));
  const fixturePath = path.resolve(
    testDir,
    "./example_data/chatgpt/example_conversations.json"
  );
  const bytes = await fs.readFile(fixturePath);
  return new Blob([bytes], { type: "application/json" });
}

test("Test ChatGPT loader from blob to load all documents", async () => {
  const blob = await readExampleExportAsBlob();
  const docs = await new ChatGPTLoader(blob).load();

  // Both conversations are returned, in file order, with 1-based indices.
  expect(docs.length).toBe(2);
  const [firstDoc, secondDoc] = docs;
  expect(firstDoc).toEqual(
    new Document({
      metadata: { source: "blob", blobType: "application/json", logIndex: 1 },
      pageContent: blobFirstLogText,
    })
  );
  expect(secondDoc).toEqual(
    new Document({
      metadata: { source: "blob", blobType: "application/json", logIndex: 2 },
      pageContent: blobSecondLogText,
    })
  );
});

test("Test ChatGPT loader from blob to only load 1 document", async () => {
  const blob = await readExampleExportAsBlob();
  const docs = await new ChatGPTLoader(blob, 1).load();

  // With numLogs = 1 only the first conversation is kept.
  expect(docs.length).toBe(1);
  const [onlyDoc] = docs;
  expect(onlyDoc).toEqual(
    new Document({
      metadata: { source: "blob", blobType: "application/json", logIndex: 1 },
      pageContent: blobFirstLogText,
    })
  );
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import * as url from "node:url"; | ||
import * as path from "node:path"; | ||
import { test, expect } from "@jest/globals"; | ||
import { Document } from "../../document.js"; | ||
import { ChatGPTLoader } from "../fs/chatgpt.js"; | ||
|
||
// Expected transcript of the first conversation in the example export.
const expectedLogOne =
  "Example Usage - user on 2023-10-16 23:40:17: Hello, what is your name?\n\nExample Usage - assistant on 2023-10-16 23:40:23: Hello! I'm just a computer program created by OpenAI, so I don't have a personal name. You can call me ChatGPT or simply ask me any questions or chat about topics you're interested in. How can I assist you today?\n\n";

// Expected transcript of the second conversation in the example export.
const expectedLogTwo =
  "Example Usage 2 - user on 2023-10-13 23:02:19: What should I do today?\n\nExample Usage 2 - assistant on 2023-10-13 23:02:27: You should contribute to LangChain!\n\nExample Usage 2 - user on 2023-10-13 23:03:30: How can I start?\n\nExample Usage 2 - assistant on 2023-10-13 23:03:38: You can take a look at the current LangChain issues and see if you can contribute to any! Don't forget to read the contributing.md file.\n\nExample Usage 2 - user on 2023-10-13 23:09:24: Thank you!\n\nExample Usage 2 - assistant on 2023-10-13 23:09:34: You're welcome! If you have any more questions or need further assistance in the future, feel free to reach out.\n\n";

/** Absolute path of the example export that sits next to this test file. */
function exampleExportPath(): string {
  const testDir = path.dirname(url.fileURLToPath(import.meta.url));
  return path.resolve(
    testDir,
    "./example_data/chatgpt/example_conversations.json"
  );
}

test("Test ChatGPT loader to load all documents", async () => {
  const filePath = exampleExportPath();
  const docs = await new ChatGPTLoader(filePath).load();

  // Both conversations are returned, in file order, with 1-based indices.
  expect(docs.length).toBe(2);
  const [firstDoc, secondDoc] = docs;
  expect(firstDoc).toEqual(
    new Document({
      metadata: { source: filePath, logIndex: 1 },
      pageContent: expectedLogOne,
    })
  );
  expect(secondDoc).toEqual(
    new Document({
      metadata: { source: filePath, logIndex: 2 },
      pageContent: expectedLogTwo,
    })
  );
});

test("Test ChatGPT loader to only load 1 document", async () => {
  const filePath = exampleExportPath();
  const docs = await new ChatGPTLoader(filePath, 1).load();

  // With numLogs = 1 only the first conversation is kept.
  expect(docs.length).toBe(1);
  const [onlyDoc] = docs;
  expect(onlyDoc).toEqual(
    new Document({
      metadata: { source: filePath, logIndex: 1 },
      pageContent: expectedLogOne,
    })
  );
});
Oops, something went wrong.
c1c988c
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Successfully deployed to the following URLs:
langchainjs-api-refs – ./docs/api_refs
langchainjs-api-refs-git-main-langchain.vercel.app
langchainjs-api-refs-langchain.vercel.app
langchainjs-api-docs.vercel.app
api.js.langchain.com