From b61d319e02cdd75322b91b6db79acc18a2865702 Mon Sep 17 00:00:00 2001 From: Raj Mehta Date: Mon, 2 Sep 2024 11:30:38 -0700 Subject: [PATCH 01/11] Document Date Reranker --- langchain/src/retrievers/recency_ranked.ts | 66 +++++++++++++++++++ .../retrievers/tests/recency_ranked.test.ts | 40 +++++++++++ 2 files changed, 106 insertions(+) create mode 100644 langchain/src/retrievers/recency_ranked.ts create mode 100644 langchain/src/retrievers/tests/recency_ranked.test.ts diff --git a/langchain/src/retrievers/recency_ranked.ts b/langchain/src/retrievers/recency_ranked.ts new file mode 100644 index 000000000000..5f68cd61d5d3 --- /dev/null +++ b/langchain/src/retrievers/recency_ranked.ts @@ -0,0 +1,66 @@ +import { BaseRetriever } from "@langchain/core/retrievers"; +import { VectorStoreInterface } from "@langchain/core/vectorstores"; +import { Document } from "@langchain/core/documents"; + +export interface RecencyRankedRetrieverConfig { + vectorStore: VectorStoreInterface; + k: number; + recencyWeight?: number; +} + +export class RecencyRankedRetriever extends BaseRetriever { + static lc_name() { + return "RecencyRankedRetriever"; + } + + lc_namespace = ["langchain", "retrievers", "recency_ranked"]; + + private vectorStore: VectorStoreInterface; + + private k: number; + + private recencyWeight: number; + + constructor(config: RecencyRankedRetrieverConfig) { + super(); + this.vectorStore = config.vectorStore; + this.k = config.k; + this.recencyWeight = config.recencyWeight ?? 
0.3; + } + + async getRelevantDocuments(query: string): Promise { + const relevantDocs = await this.vectorStore.similaritySearchWithScore(query, 15); + const rerankedDocs = this.recentDocumentRanker(relevantDocs, this.k, this.recencyWeight); + return rerankedDocs.map(([doc, _]) => doc); + } + + private recentDocumentRanker( + documents: [Document, number][], + topK: number, + recencyWeight: number + ): [Document, number][] { + if (documents.length === 0) return []; + + const oldestDate = Math.min( + ...documents.map(([doc, _]) => doc.metadata.date.getTime()) + ); + const newestDate = Math.max( + ...documents.map(([doc, _]) => doc.metadata.date.getTime()) + ); + const dateRange = newestDate - oldestDate; + + const rerankedDocuments = documents + .map(([doc, score]): [Document, number] => { + const normalizedRecency = + dateRange > 0 + ? (doc.metadata.date.getTime() - oldestDate) / dateRange + : 1; + const adjustedScore = + (1 - recencyWeight) * score + recencyWeight * normalizedRecency; + return [doc, adjustedScore]; + }) + .sort((a, b) => b[1] - a[1]); + + return rerankedDocuments.slice(0, topK); + } +} \ No newline at end of file diff --git a/langchain/src/retrievers/tests/recency_ranked.test.ts b/langchain/src/retrievers/tests/recency_ranked.test.ts new file mode 100644 index 000000000000..13bf6cc435b4 --- /dev/null +++ b/langchain/src/retrievers/tests/recency_ranked.test.ts @@ -0,0 +1,40 @@ +import { expect, test } from "@jest/globals"; +import { Document } from "@langchain/core/documents"; +import { FakeEmbeddings } from "@langchain/core/utils/testing"; +import { MemoryVectorStore } from "../../vectorstores/memory.js" +import { RecencyRankedRetriever } from "../recency_ranked.js"; + + + +test("RecencyRankedRetriever", async () => { + const docs = [ + new Document({ + pageContent: "A", + metadata: { date: new Date("2023-01-01") }, + }), + new Document({ + pageContent: "B", + metadata: { date: new Date("2023-02-01") }, + }), + new Document({ + pageContent: "C", + 
metadata: { date: new Date("2023-03-01") }, + }), + ]; + + const vectorstore = new MemoryVectorStore(new FakeEmbeddings()); + + await vectorstore.addDocuments(docs); + + const retriever = new RecencyRankedRetriever({ + vectorStore: vectorstore, + k: 2, + recencyWeight: 0.99, + }); + + const results = await retriever.getRelevantDocuments("test query"); + + expect(results).toHaveLength(2); + expect(results[0].pageContent).toBe("C"); + expect(results[1].pageContent).toBe("B"); +}); \ No newline at end of file From 41a36d72cf424812fe25c496bffde16fd57f3192 Mon Sep 17 00:00:00 2001 From: Raj Mehta Date: Mon, 2 Sep 2024 11:35:01 -0700 Subject: [PATCH 02/11] Revert "Document Date Reranker" This reverts commit b61d319e02cdd75322b91b6db79acc18a2865702. --- langchain/src/retrievers/recency_ranked.ts | 66 ------------------- .../retrievers/tests/recency_ranked.test.ts | 40 ----------- 2 files changed, 106 deletions(-) delete mode 100644 langchain/src/retrievers/recency_ranked.ts delete mode 100644 langchain/src/retrievers/tests/recency_ranked.test.ts diff --git a/langchain/src/retrievers/recency_ranked.ts b/langchain/src/retrievers/recency_ranked.ts deleted file mode 100644 index 5f68cd61d5d3..000000000000 --- a/langchain/src/retrievers/recency_ranked.ts +++ /dev/null @@ -1,66 +0,0 @@ -import { BaseRetriever } from "@langchain/core/retrievers"; -import { VectorStoreInterface } from "@langchain/core/vectorstores"; -import { Document } from "@langchain/core/documents"; - -export interface RecencyRankedRetrieverConfig { - vectorStore: VectorStoreInterface; - k: number; - recencyWeight?: number; -} - -export class RecencyRankedRetriever extends BaseRetriever { - static lc_name() { - return "RecencyRankedRetriever"; - } - - lc_namespace = ["langchain", "retrievers", "recency_ranked"]; - - private vectorStore: VectorStoreInterface; - - private k: number; - - private recencyWeight: number; - - constructor(config: RecencyRankedRetrieverConfig) { - super(); - this.vectorStore = 
config.vectorStore; - this.k = config.k; - this.recencyWeight = config.recencyWeight ?? 0.3; - } - - async getRelevantDocuments(query: string): Promise { - const relevantDocs = await this.vectorStore.similaritySearchWithScore(query, 15); - const rerankedDocs = this.recentDocumentRanker(relevantDocs, this.k, this.recencyWeight); - return rerankedDocs.map(([doc, _]) => doc); - } - - private recentDocumentRanker( - documents: [Document, number][], - topK: number, - recencyWeight: number - ): [Document, number][] { - if (documents.length === 0) return []; - - const oldestDate = Math.min( - ...documents.map(([doc, _]) => doc.metadata.date.getTime()) - ); - const newestDate = Math.max( - ...documents.map(([doc, _]) => doc.metadata.date.getTime()) - ); - const dateRange = newestDate - oldestDate; - - const rerankedDocuments = documents - .map(([doc, score]): [Document, number] => { - const normalizedRecency = - dateRange > 0 - ? (doc.metadata.date.getTime() - oldestDate) / dateRange - : 1; - const adjustedScore = - (1 - recencyWeight) * score + recencyWeight * normalizedRecency; - return [doc, adjustedScore]; - }) - .sort((a, b) => b[1] - a[1]); - - return rerankedDocuments.slice(0, topK); - } -} \ No newline at end of file diff --git a/langchain/src/retrievers/tests/recency_ranked.test.ts b/langchain/src/retrievers/tests/recency_ranked.test.ts deleted file mode 100644 index 13bf6cc435b4..000000000000 --- a/langchain/src/retrievers/tests/recency_ranked.test.ts +++ /dev/null @@ -1,40 +0,0 @@ -import { expect, test } from "@jest/globals"; -import { Document } from "@langchain/core/documents"; -import { FakeEmbeddings } from "@langchain/core/utils/testing"; -import { MemoryVectorStore } from "../../vectorstores/memory.js" -import { RecencyRankedRetriever } from "../recency_ranked.js"; - - - -test("RecencyRankedRetriever", async () => { - const docs = [ - new Document({ - pageContent: "A", - metadata: { date: new Date("2023-01-01") }, - }), - new Document({ - pageContent: 
"B", - metadata: { date: new Date("2023-02-01") }, - }), - new Document({ - pageContent: "C", - metadata: { date: new Date("2023-03-01") }, - }), - ]; - - const vectorstore = new MemoryVectorStore(new FakeEmbeddings()); - - await vectorstore.addDocuments(docs); - - const retriever = new RecencyRankedRetriever({ - vectorStore: vectorstore, - k: 2, - recencyWeight: 0.99, - }); - - const results = await retriever.getRelevantDocuments("test query"); - - expect(results).toHaveLength(2); - expect(results[0].pageContent).toBe("C"); - expect(results[1].pageContent).toBe("B"); -}); \ No newline at end of file From d904ce68e5cbeea9a10f2812393fa21e2c44dc56 Mon Sep 17 00:00:00 2001 From: Raj Mehta Date: Sun, 8 Sep 2024 18:13:20 -0700 Subject: [PATCH 03/11] bm25 search implementation --- langchain/package.json | 1 + langchain/src/retrievers/bm25_retriever.ts | 61 +++++++++++++++++++ .../retrievers/tests/bm25_retriever.test.ts | 26 ++++++++ libs/langchain-community/package.json | 1 + yarn.lock | 9 +++ 5 files changed, 98 insertions(+) create mode 100644 langchain/src/retrievers/bm25_retriever.ts create mode 100644 langchain/src/retrievers/tests/bm25_retriever.test.ts diff --git a/langchain/package.json b/langchain/package.json index 3465af3054fe..2058722452d5 100644 --- a/langchain/package.json +++ b/langchain/package.json @@ -934,6 +934,7 @@ "js-yaml": "^4.1.0", "jsonpointer": "^5.0.1", "langsmith": "~0.1.40", + "okapibm25": "^1.4.0", "openapi-types": "^12.1.3", "p-retry": "4", "uuid": "^10.0.0", diff --git a/langchain/src/retrievers/bm25_retriever.ts b/langchain/src/retrievers/bm25_retriever.ts new file mode 100644 index 000000000000..0adc557a00b4 --- /dev/null +++ b/langchain/src/retrievers/bm25_retriever.ts @@ -0,0 +1,61 @@ +import BM25 from "okapibm25"; +import { BaseRetriever } from "@langchain/core/retrievers"; +import { Document } from "@langchain/core/documents"; + +export type BM25RetrieverOptions = { + docs: Document[]; + k: number; + preprocessFunc: (text: string) => 
string[]; +}; + +/** + * A retriever that uses the BM25 algorithm to rank documents based on their + * similarity to a query. It uses the okapibm25 package for BM25 scoring. + * The k parameter determines the number of documents to return for each query. + */ +export class BM25Retriever extends BaseRetriever { + static lc_name() { + return "BM25Retriever"; + } + + lc_namespace = ["langchain", "retrievers", "bm25_retriever"]; + + static fromDocuments( + documents: Document[], + options: Omit + ) { + return new this({ ...options, docs: documents }); + } + + docs: Document[]; + + k: number; + + preprocessFunc: (text: string) => string[]; + + constructor(options: BM25RetrieverOptions) { + super(); + this.docs = options.docs; + this.k = options.k; + this.preprocessFunc = options.preprocessFunc; + } + + async _getRelevantDocuments(query: string) { + const processedQuery = this.preprocessFunc(query); + const documents = this.docs.map(doc => doc.pageContent); + const scores = BM25.default(documents, processedQuery) as number[]; + + const scoredDocs = this.docs.map((doc, index) => ({ + document: doc, + score: scores[index], + })); + + scoredDocs.sort((a, b) => b.score - a.score); + + return scoredDocs.slice(0, this.k).map(item => item.document); + } + + async invoke(input: string): Promise { + return this._getRelevantDocuments(input); + } +} diff --git a/langchain/src/retrievers/tests/bm25_retriever.test.ts b/langchain/src/retrievers/tests/bm25_retriever.test.ts new file mode 100644 index 000000000000..e2ca3185ab62 --- /dev/null +++ b/langchain/src/retrievers/tests/bm25_retriever.test.ts @@ -0,0 +1,26 @@ +import { expect, test } from "@jest/globals"; +import { Document } from "@langchain/core/documents"; +import { BM25Retriever } from "../bm25_retriever.js"; + +test("BM25Retriever", async () => { + const docs = [ + new Document({ + pageContent: "The quick brown fox jumps over the lazy dog", + }), + new Document({ + pageContent: "A lazy dog sleeps all day", + }), + new 
Document({ + pageContent: "The brown fox is quick and clever", + }), + ]; + + const retriever = BM25Retriever.fromDocuments(docs, { + k: 2, + preprocessFunc: (text: string) => text.toLowerCase().split(/\s+/), + }); + const results = await retriever.invoke("the fox and the dog"); + + expect(results).toHaveLength(2); + expect(results[0].pageContent).toBe("The quick brown fox jumps over the lazy dog"); +}); diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json index 767540e6f2fd..efbad8decb6c 100644 --- a/libs/langchain-community/package.json +++ b/libs/langchain-community/package.json @@ -43,6 +43,7 @@ "js-yaml": "^4.1.0", "langchain": "~0.2.3", "langsmith": "~0.1.30", + "okapibm25": "^1.4.0", "uuid": "^10.0.0", "zod": "^3.22.3", "zod-to-json-schema": "^3.22.5" diff --git a/yarn.lock b/yarn.lock index 52ec639319f4..a8ab2849ac31 100644 --- a/yarn.lock +++ b/yarn.lock @@ -11294,6 +11294,7 @@ __metadata: node-llama-cpp: 2.7.3 notion-to-md: ^3.1.0 officeparser: ^4.0.4 + okapibm25: ^1.4.0 pdf-parse: 1.1.1 pg: ^8.11.0 pg-copy-streams: ^6.0.5 @@ -32230,6 +32231,7 @@ __metadata: node-llama-cpp: 2.7.3 notion-to-md: ^3.1.0 officeparser: ^4.0.4 + okapibm25: ^1.4.0 openai: ^4.41.1 openapi-types: ^12.1.3 p-retry: 4 @@ -35005,6 +35007,13 @@ __metadata: languageName: node linkType: hard +"okapibm25@npm:^1.4.0": + version: 1.4.0 + resolution: "okapibm25@npm:1.4.0" + checksum: 8f513ef7a05d78fc7a32d19cfbf44405c8164ddab27431f80516c8a9e00ed9b9f2a3bdbdc11633d3c0611b97f0a86b1f90bd84f6e0b0eb8424a856b34a9fd3de + languageName: node + linkType: hard + "ollama@npm:^0.5.6": version: 0.5.6 resolution: "ollama@npm:0.5.6" From a861f985c387abab3fe36a5a824a0aeb36b68ee7 Mon Sep 17 00:00:00 2001 From: Raj Mehta Date: Mon, 9 Sep 2024 08:25:10 -0700 Subject: [PATCH 04/11] fixed retriever input args --- langchain/src/retrievers/bm25_retriever.ts | 8 ++++---- langchain/src/retrievers/tests/bm25_retriever.test.ts | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) 
diff --git a/langchain/src/retrievers/bm25_retriever.ts b/langchain/src/retrievers/bm25_retriever.ts index 0adc557a00b4..52b76602c2d6 100644 --- a/langchain/src/retrievers/bm25_retriever.ts +++ b/langchain/src/retrievers/bm25_retriever.ts @@ -5,7 +5,6 @@ import { Document } from "@langchain/core/documents"; export type BM25RetrieverOptions = { docs: Document[]; k: number; - preprocessFunc: (text: string) => string[]; }; /** @@ -31,13 +30,14 @@ export class BM25Retriever extends BaseRetriever { k: number; - preprocessFunc: (text: string) => string[]; - constructor(options: BM25RetrieverOptions) { super(); this.docs = options.docs; this.k = options.k; - this.preprocessFunc = options.preprocessFunc; + } + + private preprocessFunc(text: string): string[] { + return text.toLowerCase().split(/\s+/); } async _getRelevantDocuments(query: string) { diff --git a/langchain/src/retrievers/tests/bm25_retriever.test.ts b/langchain/src/retrievers/tests/bm25_retriever.test.ts index e2ca3185ab62..495e9cb079d8 100644 --- a/langchain/src/retrievers/tests/bm25_retriever.test.ts +++ b/langchain/src/retrievers/tests/bm25_retriever.test.ts @@ -17,7 +17,6 @@ test("BM25Retriever", async () => { const retriever = BM25Retriever.fromDocuments(docs, { k: 2, - preprocessFunc: (text: string) => text.toLowerCase().split(/\s+/), }); const results = await retriever.invoke("the fox and the dog"); From 696de7a237014bece04196a52e738323fc988358 Mon Sep 17 00:00:00 2001 From: jacoblee93 Date: Tue, 10 Sep 2024 14:12:09 -0700 Subject: [PATCH 05/11] Update dep --- langchain/package.json | 6 +++++- yarn.lock | 12 +++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/langchain/package.json b/langchain/package.json index 2058722452d5..4fb8016571ca 100644 --- a/langchain/package.json +++ b/langchain/package.json @@ -664,6 +664,7 @@ "node-llama-cpp": "2.7.3", "notion-to-md": "^3.1.0", "officeparser": "^4.0.4", + "okapibm25": "^1.4.1", "openai": "^4.41.1", "pdf-parse": "1.1.1", "peggy": 
"^3.0.2", @@ -731,6 +732,7 @@ "node-llama-cpp": "*", "notion-to-md": "*", "officeparser": "*", + "okapibm25": "^1.4.1", "pdf-parse": "*", "peggy": "^3.0.2", "playwright": "*", @@ -882,6 +884,9 @@ "officeparser": { "optional": true }, + "okapibm25": { + "optional": true + }, "pdf-parse": { "optional": true }, @@ -934,7 +939,6 @@ "js-yaml": "^4.1.0", "jsonpointer": "^5.0.1", "langsmith": "~0.1.40", - "okapibm25": "^1.4.0", "openapi-types": "^12.1.3", "p-retry": "4", "uuid": "^10.0.0", diff --git a/yarn.lock b/yarn.lock index a8ab2849ac31..5746accf232c 100644 --- a/yarn.lock +++ b/yarn.lock @@ -32231,7 +32231,7 @@ __metadata: node-llama-cpp: 2.7.3 notion-to-md: ^3.1.0 officeparser: ^4.0.4 - okapibm25: ^1.4.0 + okapibm25: ^1.4.1 openai: ^4.41.1 openapi-types: ^12.1.3 p-retry: 4 @@ -32304,6 +32304,7 @@ __metadata: node-llama-cpp: "*" notion-to-md: "*" officeparser: "*" + okapibm25: ^1.4.1 pdf-parse: "*" peggy: ^3.0.2 playwright: "*" @@ -32409,6 +32410,8 @@ __metadata: optional: true officeparser: optional: true + okapibm25: + optional: true pdf-parse: optional: true peggy: @@ -35014,6 +35017,13 @@ __metadata: languageName: node linkType: hard +"okapibm25@npm:^1.4.1": + version: 1.4.1 + resolution: "okapibm25@npm:1.4.1" + checksum: f619b8888fa983861116f2fd4f2896411d157a75c5ae79d81fd9322821d274f127e0199e0f0b93961c49b6f31b710723ffd893a77820bdd0e52c859c2cbcb594 + languageName: node + linkType: hard + "ollama@npm:^0.5.6": version: 0.5.6 resolution: "ollama@npm:0.5.6" From 42cac713bbd0c5ae2b6cd09e5ec3e15a7faaf78a Mon Sep 17 00:00:00 2001 From: jacoblee93 Date: Tue, 10 Sep 2024 14:32:51 -0700 Subject: [PATCH 06/11] Move to community --- langchain/package.json | 5 ----- libs/langchain-community/langchain.config.js | 2 ++ libs/langchain-community/package.json | 5 +++++ .../src/retrievers/bm25.ts | 21 ++++++++++--------- .../src/retrievers/tests/bm25.test.ts | 6 ++++-- yarn.lock | 16 ++++---------- 6 files changed, 26 insertions(+), 29 deletions(-) rename 
langchain/src/retrievers/bm25_retriever.ts => libs/langchain-community/src/retrievers/bm25.ts (78%) rename langchain/src/retrievers/tests/bm25_retriever.test.ts => libs/langchain-community/src/retrievers/tests/bm25.test.ts (80%) diff --git a/langchain/package.json b/langchain/package.json index 140c5126125f..91c99b175d6e 100644 --- a/langchain/package.json +++ b/langchain/package.json @@ -664,7 +664,6 @@ "node-llama-cpp": "2.7.3", "notion-to-md": "^3.1.0", "officeparser": "^4.0.4", - "okapibm25": "^1.4.1", "openai": "^4.41.1", "pdf-parse": "1.1.1", "peggy": "^3.0.2", @@ -732,7 +731,6 @@ "node-llama-cpp": "*", "notion-to-md": "*", "officeparser": "*", - "okapibm25": "^1.4.1", "pdf-parse": "*", "peggy": "^3.0.2", "playwright": "*", @@ -884,9 +882,6 @@ "officeparser": { "optional": true }, - "okapibm25": { - "optional": true - }, "pdf-parse": { "optional": true }, diff --git a/libs/langchain-community/langchain.config.js b/libs/langchain-community/langchain.config.js index 3709f58812e7..85ab7194bfb6 100644 --- a/libs/langchain-community/langchain.config.js +++ b/libs/langchain-community/langchain.config.js @@ -194,6 +194,7 @@ export const config = { // retrievers "retrievers/amazon_kendra": "retrievers/amazon_kendra", "retrievers/amazon_knowledge_base": "retrievers/amazon_knowledge_base", + "retrievers/bm25": "retrievers/bm25", "retrievers/chaindesk": "retrievers/chaindesk", "retrievers/databerry": "retrievers/databerry", "retrievers/dria": "retrievers/dria", @@ -426,6 +427,7 @@ export const config = { "chat_models/zhipuai", "retrievers/amazon_kendra", "retrievers/amazon_knowledge_base", + "retrievers/bm25", "retrievers/dria", "retrievers/metal", "retrievers/supabase", diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json index 0bf4c1e0607f..b2cf4e39cea9 100644 --- a/libs/langchain-community/package.json +++ b/libs/langchain-community/package.json @@ -187,6 +187,7 @@ "node-llama-cpp": "2.7.3", "notion-to-md": "^3.1.0", 
"officeparser": "^4.0.4", + "okapibm25": "^1.4.1", "pdf-parse": "1.1.1", "pg": "^8.11.0", "pg-copy-streams": "^6.0.5", @@ -313,6 +314,7 @@ "node-llama-cpp": "*", "notion-to-md": "^3.1.0", "officeparser": "^4.0.4", + "okapibm25": "^1.4.1", "pdf-parse": "1.1.1", "pg": "^8.11.0", "pg-copy-streams": "^6.0.5", @@ -633,6 +635,9 @@ "officeparser": { "optional": true }, + "okapibm25": { + "optional": true + }, "pdf-parse": { "optional": true }, diff --git a/langchain/src/retrievers/bm25_retriever.ts b/libs/langchain-community/src/retrievers/bm25.ts similarity index 78% rename from langchain/src/retrievers/bm25_retriever.ts rename to libs/langchain-community/src/retrievers/bm25.ts index 52b76602c2d6..48d1562f27cf 100644 --- a/langchain/src/retrievers/bm25_retriever.ts +++ b/libs/langchain-community/src/retrievers/bm25.ts @@ -9,16 +9,16 @@ export type BM25RetrieverOptions = { /** * A retriever that uses the BM25 algorithm to rank documents based on their - * similarity to a query. It uses the okapibm25 package for BM25 scoring. + * similarity to a query. It uses the "okapibm25" package for BM25 scoring. * The k parameter determines the number of documents to return for each query. 
*/ export class BM25Retriever extends BaseRetriever { static lc_name() { return "BM25Retriever"; } - + lc_namespace = ["langchain", "retrievers", "bm25_retriever"]; - + static fromDocuments( documents: Document[], options: Omit @@ -29,7 +29,7 @@ export class BM25Retriever extends BaseRetriever { docs: Document[]; k: number; - + constructor(options: BM25RetrieverOptions) { super(); this.docs = options.docs; @@ -42,17 +42,18 @@ export class BM25Retriever extends BaseRetriever { async _getRelevantDocuments(query: string) { const processedQuery = this.preprocessFunc(query); - const documents = this.docs.map(doc => doc.pageContent); - const scores = BM25.default(documents, processedQuery) as number[]; - + const documents = this.docs.map((doc) => doc.pageContent); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const scores = (BM25 as any).default(documents, processedQuery) as number[]; + const scoredDocs = this.docs.map((doc, index) => ({ document: doc, score: scores[index], })); - + scoredDocs.sort((a, b) => b.score - a.score); - - return scoredDocs.slice(0, this.k).map(item => item.document); + + return scoredDocs.slice(0, this.k).map((item) => item.document); } async invoke(input: string): Promise { diff --git a/langchain/src/retrievers/tests/bm25_retriever.test.ts b/libs/langchain-community/src/retrievers/tests/bm25.test.ts similarity index 80% rename from langchain/src/retrievers/tests/bm25_retriever.test.ts rename to libs/langchain-community/src/retrievers/tests/bm25.test.ts index 495e9cb079d8..bcfe46f940b8 100644 --- a/langchain/src/retrievers/tests/bm25_retriever.test.ts +++ b/libs/langchain-community/src/retrievers/tests/bm25.test.ts @@ -1,6 +1,6 @@ import { expect, test } from "@jest/globals"; import { Document } from "@langchain/core/documents"; -import { BM25Retriever } from "../bm25_retriever.js"; +import { BM25Retriever } from "../bm25.js"; test("BM25Retriever", async () => { const docs = [ @@ -21,5 +21,7 @@ test("BM25Retriever", async 
() => { const results = await retriever.invoke("the fox and the dog"); expect(results).toHaveLength(2); - expect(results[0].pageContent).toBe("The quick brown fox jumps over the lazy dog"); + expect(results[0].pageContent).toBe( + "The quick brown fox jumps over the lazy dog" + ); }); diff --git a/yarn.lock b/yarn.lock index 4bc8c68ba713..721962918d0a 100644 --- a/yarn.lock +++ b/yarn.lock @@ -11320,7 +11320,7 @@ __metadata: node-llama-cpp: 2.7.3 notion-to-md: ^3.1.0 officeparser: ^4.0.4 - okapibm25: ^1.4.0 + okapibm25: ^1.4.1 pdf-parse: 1.1.1 pg: ^8.11.0 pg-copy-streams: ^6.0.5 @@ -11449,6 +11449,7 @@ __metadata: node-llama-cpp: "*" notion-to-md: ^3.1.0 officeparser: ^4.0.4 + okapibm25: ^1.4.1 pdf-parse: 1.1.1 pg: ^8.11.0 pg-copy-streams: ^6.0.5 @@ -11669,6 +11670,8 @@ __metadata: optional: true officeparser: optional: true + okapibm25: + optional: true pdf-parse: optional: true pg: @@ -32272,7 +32275,6 @@ __metadata: node-llama-cpp: 2.7.3 notion-to-md: ^3.1.0 officeparser: ^4.0.4 - okapibm25: ^1.4.1 openai: ^4.41.1 openapi-types: ^12.1.3 p-retry: 4 @@ -32345,7 +32347,6 @@ __metadata: node-llama-cpp: "*" notion-to-md: "*" officeparser: "*" - okapibm25: ^1.4.1 pdf-parse: "*" peggy: ^3.0.2 playwright: "*" @@ -32451,8 +32452,6 @@ __metadata: optional: true officeparser: optional: true - okapibm25: - optional: true pdf-parse: optional: true peggy: @@ -35051,13 +35050,6 @@ __metadata: languageName: node linkType: hard -"okapibm25@npm:^1.4.0": - version: 1.4.0 - resolution: "okapibm25@npm:1.4.0" - checksum: 8f513ef7a05d78fc7a32d19cfbf44405c8164ddab27431f80516c8a9e00ed9b9f2a3bdbdc11633d3c0611b97f0a86b1f90bd84f6e0b0eb8424a856b34a9fd3de - languageName: node - linkType: hard - "okapibm25@npm:^1.4.1": version: 1.4.1 resolution: "okapibm25@npm:1.4.1" From 70e05707ca616013f7702aa740d90a52a9d2daf2 Mon Sep 17 00:00:00 2001 From: jacoblee93 Date: Tue, 10 Sep 2024 14:42:58 -0700 Subject: [PATCH 07/11] Remove direct dep --- libs/langchain-community/package.json | 1 - 1 file 
changed, 1 deletion(-) diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json index b2cf4e39cea9..7a297de6564f 100644 --- a/libs/langchain-community/package.json +++ b/libs/langchain-community/package.json @@ -43,7 +43,6 @@ "js-yaml": "^4.1.0", "langchain": "~0.2.3", "langsmith": "~0.1.30", - "okapibm25": "^1.4.0", "uuid": "^10.0.0", "zod": "^3.22.3", "zod-to-json-schema": "^3.22.5" From af5d18fc5c06fe6da5a7ae9a8dabd1f9128dde24 Mon Sep 17 00:00:00 2001 From: jacoblee93 Date: Tue, 10 Sep 2024 14:57:33 -0700 Subject: [PATCH 08/11] Inline dep due to import issues --- .../docs/integrations/retrievers/bm25.ipynb | 101 ++++++++++++++++++ libs/langchain-community/.gitignore | 4 + libs/langchain-community/package.json | 13 +++ .../src/load/import_constants.ts | 1 + .../src/retrievers/bm25.ts | 12 +-- .../src/utils/@furkantoprak/bm25/BM25.ts | 100 +++++++++++++++++ .../src/utils/@furkantoprak/bm25/LICENSE.md | 21 ++++ 7 files changed, 246 insertions(+), 6 deletions(-) create mode 100644 docs/core_docs/docs/integrations/retrievers/bm25.ipynb create mode 100644 libs/langchain-community/src/utils/@furkantoprak/bm25/BM25.ts create mode 100644 libs/langchain-community/src/utils/@furkantoprak/bm25/LICENSE.md diff --git a/docs/core_docs/docs/integrations/retrievers/bm25.ipynb b/docs/core_docs/docs/integrations/retrievers/bm25.ipynb new file mode 100644 index 000000000000..b106554c6b40 --- /dev/null +++ b/docs/core_docs/docs/integrations/retrievers/bm25.ipynb @@ -0,0 +1,101 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BM25\n", + "\n", + "BM25, also known as [Okapi BM25](https://en.wikipedia.org/wiki/Okapi_BM25), is a ranking function used in information retrieval systems to estimate the relevance of documents to a given search query.\n", + "\n", + "You can use it as part of your retrieval pipeline to rerank documents as a postprocessing step after retrieving an initial set of documents from another 
source.\n", + "\n", + "## Setup\n", + "\n", + "The `BM25Retriever` is exported from `@langchain/community`. You'll need to install it like this:\n", + "\n", + "```{=mdx}\n", + "import IntegrationInstallTooltip from \"@mdx_components/integration_install_tooltip.mdx\";\n", + "import Npm2Yarn from \"@theme/Npm2Yarn\";\n", + "\n", + "<IntegrationInstallTooltip></IntegrationInstallTooltip>\n", + "\n", + "<Npm2Yarn>\n", + " @langchain/community @langchain/core\n", + "</Npm2Yarn>\n", + "```\n", + "\n", + "This retriever uses code from [`this implementation`](https://github.com/FurkanToprak/OkapiBM25) of Okapi BM25.\n", + "\n", + "## Usage\n", + "\n", + "You can now create a new retriever with previously retrieved documents:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\n", + " { pageContent: 'mitochondria is made of lipids', metadata: {} },\n", + " {\n", + " pageContent: 'mitochondria is the powerhouse of the cell',\n", + " metadata: {}\n", + " },\n", + " { pageContent: 'Buildings are made out of brick', metadata: {} },\n", + " { pageContent: 'Buildings are made out of wood', metadata: {} }\n", + "]\n" + ] + } + ], + "source": [ + "import { BM25Retriever } from \"@langchain/community/retrievers/bm25\";\n", + "\n", + "const retriever = BM25Retriever.fromDocuments([\n", + " { pageContent: \"Buildings are made out of brick\", metadata: {} },\n", + " { pageContent: \"Buildings are made out of wood\", metadata: {} },\n", + " { pageContent: \"Buildings are made out of stone\", metadata: {} },\n", + " { pageContent: \"Cars are made out of metal\", metadata: {} },\n", + " { pageContent: \"Cars are made out of plastic\", metadata: {} },\n", + " { pageContent: \"mitochondria is the powerhouse of the cell\", metadata: {} },\n", + " { pageContent: \"mitochondria is made of lipids\", metadata: {} },\n", + "], { k: 4 });\n", + "\n", + "// Will return the 4 documents reranked by the BM25 algorithm\n", + "await retriever.invoke(\"mitochondria\");" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "TypeScript", + "language": "typescript", + "name": "tslab" + }, + "language_info": { + "codemirror_mode": { + "mode": "typescript", + "name": "javascript", + "typescript": true + }, + "file_extension": ".ts", + "mimetype": "text/typescript", + "name": "typescript", + "version": "3.7.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/libs/langchain-community/.gitignore b/libs/langchain-community/.gitignore index 8f6f452a7468..1963c98f29be 100644 --- a/libs/langchain-community/.gitignore +++ b/libs/langchain-community/.gitignore @@ -618,6 +618,10 @@ retrievers/amazon_knowledge_base.cjs retrievers/amazon_knowledge_base.js retrievers/amazon_knowledge_base.d.ts retrievers/amazon_knowledge_base.d.cts +retrievers/bm25.cjs +retrievers/bm25.js +retrievers/bm25.d.ts +retrievers/bm25.d.cts retrievers/chaindesk.cjs retrievers/chaindesk.js retrievers/chaindesk.d.ts diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json index 7a297de6564f..2ce214216c5b 100644 --- a/libs/langchain-community/package.json +++ b/libs/langchain-community/package.json @@ -2100,6 +2100,15 @@ "import": "./retrievers/amazon_knowledge_base.js", "require": "./retrievers/amazon_knowledge_base.cjs" }, + "./retrievers/bm25": { + "types": { + "import": "./retrievers/bm25.d.ts", + "require": "./retrievers/bm25.d.cts", + "default": "./retrievers/bm25.d.ts" + }, + "import": "./retrievers/bm25.js", + "require": "./retrievers/bm25.cjs" + }, "./retrievers/chaindesk": { "types": { "import": "./retrievers/chaindesk.d.ts", @@ -3687,6 +3696,10 @@ "retrievers/amazon_knowledge_base.js", "retrievers/amazon_knowledge_base.d.ts", "retrievers/amazon_knowledge_base.d.cts", + "retrievers/bm25.cjs", + "retrievers/bm25.js", + "retrievers/bm25.d.ts", + "retrievers/bm25.d.cts", "retrievers/chaindesk.cjs", 
"retrievers/chaindesk.js", "retrievers/chaindesk.d.ts", diff --git a/libs/langchain-community/src/load/import_constants.ts b/libs/langchain-community/src/load/import_constants.ts index 25ef64e5bac1..af2c93648da2 100644 --- a/libs/langchain-community/src/load/import_constants.ts +++ b/libs/langchain-community/src/load/import_constants.ts @@ -101,6 +101,7 @@ export const optionalImportEntrypoints: string[] = [ "langchain_community/callbacks/handlers/upstash_ratelimit", "langchain_community/retrievers/amazon_kendra", "langchain_community/retrievers/amazon_knowledge_base", + "langchain_community/retrievers/bm25", "langchain_community/retrievers/dria", "langchain_community/retrievers/metal", "langchain_community/retrievers/supabase", diff --git a/libs/langchain-community/src/retrievers/bm25.ts b/libs/langchain-community/src/retrievers/bm25.ts index 48d1562f27cf..a9de3d9df3f9 100644 --- a/libs/langchain-community/src/retrievers/bm25.ts +++ b/libs/langchain-community/src/retrievers/bm25.ts @@ -1,11 +1,12 @@ -import BM25 from "okapibm25"; -import { BaseRetriever } from "@langchain/core/retrievers"; +import { BaseRetriever, BaseRetrieverInput } from "@langchain/core/retrievers"; import { Document } from "@langchain/core/documents"; +import { BM25 } from "../utils/@furkantoprak/bm25/BM25.js"; + export type BM25RetrieverOptions = { docs: Document[]; k: number; -}; +} & BaseRetrieverInput; /** * A retriever that uses the BM25 algorithm to rank documents based on their @@ -31,7 +32,7 @@ export class BM25Retriever extends BaseRetriever { k: number; constructor(options: BM25RetrieverOptions) { - super(); + super(options); this.docs = options.docs; this.k = options.k; } @@ -43,8 +44,7 @@ export class BM25Retriever extends BaseRetriever { async _getRelevantDocuments(query: string) { const processedQuery = this.preprocessFunc(query); const documents = this.docs.map((doc) => doc.pageContent); - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const scores = (BM25 as 
any).default(documents, processedQuery) as number[]; + const scores = BM25(documents, processedQuery) as number[]; const scoredDocs = this.docs.map((doc, index) => ({ document: doc, diff --git a/libs/langchain-community/src/utils/@furkantoprak/bm25/BM25.ts b/libs/langchain-community/src/utils/@furkantoprak/bm25/BM25.ts new file mode 100644 index 000000000000..e3e2621168e7 --- /dev/null +++ b/libs/langchain-community/src/utils/@furkantoprak/bm25/BM25.ts @@ -0,0 +1,100 @@ +/** + * Adapted from + * https://github.com/FurkanToprak/OkapiBM25 + * + * Inlined due to CJS import issues. + */ + +/** Gets word count. */ +export const getWordCount = (corpus: string) => { + return ((corpus || "").match(/\w+/g) || []).length; +}; + +/** Number of occurrences of a word in a string. */ +export const getTermFrequency = (term: string, corpus: string) => { + return ((corpus || "").match(new RegExp(term, "g")) || []).length; +}; + +/** Inverse document frequency. */ +export const getIDF = (term: string, documents: string[]) => { + // Number of relevant documents. + const relevantDocuments = documents.filter((document: string) => + document.includes(term) + ).length; + return Math.log( + (documents.length - relevantDocuments + 0.5) / (relevantDocuments + 0.5) + 1 + ); +}; + +/** Represents a document; useful when sorting results. + */ +export interface BMDocument { + /** The document that was originally scored. */ + document: string; + /** The score that the document receives. */ + score: number; +} + +/** Constants that are free parameters used in BM25, specifically for term-frequency saturation (k1) and document-length normalization (b). */ +export interface BMConstants { + /** Free parameter. Is 0.75 by default. */ + b?: number; + /** Free parameter. Is 1.2 by default.
Generally in range [1.2, 2.0] */ + k1?: number; +} + +/** If this returns a positive value, the sorting results in secondEl coming before firstEl; otherwise, firstEl comes before secondEl. */ +export type BMSorter = (firstEl: BMDocument, secondEl: BMDocument) => number; + +/** Implementation of Okapi BM25 algorithm. + * @param documents: Collection of documents. + * @param keywords: query terms. + * @param constants: Contains free parameters k1 and b. b=0.75 and k1=1.2 by default. + * @param sorter: A function that allows you to sort results by a given rule. If not provided, returns results corresponding to the original order. + * If this option is provided, the return type will not be an array of scores but an array of documents with their scores. + */ +export function BM25( + documents: string[], + keywords: string[], + constants?: BMConstants, + sorter?: BMSorter +): number[] | BMDocument[] { + const b = constants && constants.b ? constants.b : 0.75; + const k1 = constants && constants.k1 ? constants.k1 : 1.2; + const documentLengths = documents.map((document: string) => + getWordCount(document) + ); + const averageDocumentLength = + documentLengths.reduce((a, b) => a + b, 0) / documents.length; + const idfByKeyword = keywords.reduce((obj, keyword) => { + obj.set(keyword, getIDF(keyword, documents)); + return obj; + }, new Map()); + + const scores = documents.map((document: string, index: number) => { + const score = keywords + .map((keyword: string) => { + const inverseDocumentFrequency = idfByKeyword.get(keyword); + if (inverseDocumentFrequency === undefined) { + throw new Error("Missing keyword."); + } + const termFrequency = getTermFrequency(keyword, document); + const documentLength = documentLengths[index]; + return ( + (inverseDocumentFrequency * (termFrequency * (k1 + 1))) / + (termFrequency + + k1 * (1 - b + (b * documentLength) / averageDocumentLength)) + ); + }) + .reduce((a: number, b: number) => a + b, 0); + if (sorter) { + return { score, document } as BMDocument; + }
+ return score; + }); + // sort the results + if (sorter) { + return (scores as BMDocument[]).sort(sorter); + } + return scores as number[]; +} diff --git a/libs/langchain-community/src/utils/@furkantoprak/bm25/LICENSE.md b/libs/langchain-community/src/utils/@furkantoprak/bm25/LICENSE.md new file mode 100644 index 000000000000..8dd59105dd36 --- /dev/null +++ b/libs/langchain-community/src/utils/@furkantoprak/bm25/LICENSE.md @@ -0,0 +1,21 @@ +# MIT License + +## Copyright (c) 2020 Furkan Toprak + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
From 5811ca680dbcfa20ce745ba194061ab59982c888 Mon Sep 17 00:00:00 2001 From: jacoblee93 Date: Tue, 10 Sep 2024 15:04:04 -0700 Subject: [PATCH 09/11] Fix --- libs/langchain-community/src/retrievers/bm25.ts | 4 ---- 1 file changed, 4 deletions(-) diff --git a/libs/langchain-community/src/retrievers/bm25.ts b/libs/langchain-community/src/retrievers/bm25.ts index a9de3d9df3f9..dfc04709cba1 100644 --- a/libs/langchain-community/src/retrievers/bm25.ts +++ b/libs/langchain-community/src/retrievers/bm25.ts @@ -55,8 +55,4 @@ export class BM25Retriever extends BaseRetriever { return scoredDocs.slice(0, this.k).map((item) => item.document); } - - async invoke(input: string): Promise { - return this._getRelevantDocuments(input); - } } From 1deb7b1d9f98f16cb14460e75b2f29b3efc95b2d Mon Sep 17 00:00:00 2001 From: jacoblee93 Date: Tue, 10 Sep 2024 15:08:05 -0700 Subject: [PATCH 10/11] Remove dep --- libs/langchain-community/langchain.config.js | 1 - libs/langchain-community/package.json | 5 ----- libs/langchain-community/src/load/import_constants.ts | 1 - libs/langchain-community/src/load/import_map.ts | 1 + 4 files changed, 1 insertion(+), 7 deletions(-) diff --git a/libs/langchain-community/langchain.config.js b/libs/langchain-community/langchain.config.js index 85ab7194bfb6..ea6b48d2e3a7 100644 --- a/libs/langchain-community/langchain.config.js +++ b/libs/langchain-community/langchain.config.js @@ -427,7 +427,6 @@ export const config = { "chat_models/zhipuai", "retrievers/amazon_kendra", "retrievers/amazon_knowledge_base", - "retrievers/bm25", "retrievers/dria", "retrievers/metal", "retrievers/supabase", diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json index 2ce214216c5b..78ff8a877b6a 100644 --- a/libs/langchain-community/package.json +++ b/libs/langchain-community/package.json @@ -186,7 +186,6 @@ "node-llama-cpp": "2.7.3", "notion-to-md": "^3.1.0", "officeparser": "^4.0.4", - "okapibm25": "^1.4.1", "pdf-parse": "1.1.1", "pg": 
"^8.11.0", "pg-copy-streams": "^6.0.5", @@ -313,7 +312,6 @@ "node-llama-cpp": "*", "notion-to-md": "^3.1.0", "officeparser": "^4.0.4", - "okapibm25": "^1.4.1", "pdf-parse": "1.1.1", "pg": "^8.11.0", "pg-copy-streams": "^6.0.5", @@ -634,9 +632,6 @@ "officeparser": { "optional": true }, - "okapibm25": { - "optional": true - }, "pdf-parse": { "optional": true }, diff --git a/libs/langchain-community/src/load/import_constants.ts b/libs/langchain-community/src/load/import_constants.ts index af2c93648da2..25ef64e5bac1 100644 --- a/libs/langchain-community/src/load/import_constants.ts +++ b/libs/langchain-community/src/load/import_constants.ts @@ -101,7 +101,6 @@ export const optionalImportEntrypoints: string[] = [ "langchain_community/callbacks/handlers/upstash_ratelimit", "langchain_community/retrievers/amazon_kendra", "langchain_community/retrievers/amazon_knowledge_base", - "langchain_community/retrievers/bm25", "langchain_community/retrievers/dria", "langchain_community/retrievers/metal", "langchain_community/retrievers/supabase", diff --git a/libs/langchain-community/src/load/import_map.ts b/libs/langchain-community/src/load/import_map.ts index 59efd56b760e..5bbd9e4d0a01 100644 --- a/libs/langchain-community/src/load/import_map.ts +++ b/libs/langchain-community/src/load/import_map.ts @@ -54,6 +54,7 @@ export * as chat_models__moonshot from "../chat_models/moonshot.js"; export * as chat_models__ollama from "../chat_models/ollama.js"; export * as chat_models__togetherai from "../chat_models/togetherai.js"; export * as chat_models__yandex from "../chat_models/yandex.js"; +export * as retrievers__bm25 from "../retrievers/bm25.js"; export * as retrievers__chaindesk from "../retrievers/chaindesk.js"; export * as retrievers__databerry from "../retrievers/databerry.js"; export * as retrievers__remote from "../retrievers/remote/index.js"; From fd77a98ccf448dc8f64ec88272fe2b09a24fe93f Mon Sep 17 00:00:00 2001 From: jacoblee93 Date: Tue, 10 Sep 2024 15:10:17 -0700 Subject: 
[PATCH 11/11] Update lock --- yarn.lock | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/yarn.lock b/yarn.lock index 721962918d0a..ff5f4ee61df5 100644 --- a/yarn.lock +++ b/yarn.lock @@ -11320,7 +11320,6 @@ __metadata: node-llama-cpp: 2.7.3 notion-to-md: ^3.1.0 officeparser: ^4.0.4 - okapibm25: ^1.4.1 pdf-parse: 1.1.1 pg: ^8.11.0 pg-copy-streams: ^6.0.5 @@ -11449,7 +11448,6 @@ __metadata: node-llama-cpp: "*" notion-to-md: ^3.1.0 officeparser: ^4.0.4 - okapibm25: ^1.4.1 pdf-parse: 1.1.1 pg: ^8.11.0 pg-copy-streams: ^6.0.5 @@ -11670,8 +11668,6 @@ __metadata: optional: true officeparser: optional: true - okapibm25: - optional: true pdf-parse: optional: true pg: @@ -35050,13 +35046,6 @@ __metadata: languageName: node linkType: hard -"okapibm25@npm:^1.4.1": - version: 1.4.1 - resolution: "okapibm25@npm:1.4.1" - checksum: f619b8888fa983861116f2fd4f2896411d157a75c5ae79d81fd9322821d274f127e0199e0f0b93961c49b6f31b710723ffd893a77820bdd0e52c859c2cbcb594 - languageName: node - linkType: hard - "ollama@npm:^0.5.6": version: 0.5.6 resolution: "ollama@npm:0.5.6"