diff --git a/docs/core_docs/docs/integrations/retrievers/bm25.ipynb b/docs/core_docs/docs/integrations/retrievers/bm25.ipynb
new file mode 100644
index 000000000000..b106554c6b40
--- /dev/null
+++ b/docs/core_docs/docs/integrations/retrievers/bm25.ipynb
@@ -0,0 +1,101 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# BM25\n",
+ "\n",
+ "BM25, also known as [Okapi BM25](https://en.wikipedia.org/wiki/Okapi_BM25), is a ranking function used in information retrieval systems to estimate the relevance of documents to a given search query.\n",
+ "\n",
+ "You can use it as part of your retrieval pipeline to rerank documents as a postprocessing step after retrieving an initial set of documents from another source.\n",
+ "\n",
+ "## Setup\n",
+ "\n",
+ "The `BM25Retriever` is exported from `@langchain/community`. You'll need to install it like this:\n",
+ "\n",
+ "```{=mdx}\n",
+ "import IntegrationInstallTooltip from \"@mdx_components/integration_install_tooltip.mdx\";\n",
+ "import Npm2Yarn from \"@theme/Npm2Yarn\";\n",
+ "\n",
+ "<IntegrationInstallTooltip></IntegrationInstallTooltip>\n",
+ "\n",
+ "<Npm2Yarn>\n",
+ " @langchain/community @langchain/core\n",
+ "</Npm2Yarn>\n",
+ "```\n",
+ "\n",
+ "This retriever uses code from [`this implementation`](https://github.com/FurkanToprak/OkapiBM25) of Okapi BM25.\n",
+ "\n",
+ "## Usage\n",
+ "\n",
+ "You can now create a new retriever with previously retrieved documents:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[\n",
+ " { pageContent: 'mitochondria is made of lipids', metadata: {} },\n",
+ " {\n",
+ " pageContent: 'mitochondria is the powerhouse of the cell',\n",
+ " metadata: {}\n",
+ " },\n",
+ " { pageContent: 'Buildings are made out of brick', metadata: {} },\n",
+ " { pageContent: 'Buildings are made out of wood', metadata: {} }\n",
+ "]\n"
+ ]
+ }
+ ],
+ "source": [
+ "import { BM25Retriever } from \"@langchain/community/retrievers/bm25\";\n",
+ "\n",
+ "const retriever = BM25Retriever.fromDocuments([\n",
+ " { pageContent: \"Buildings are made out of brick\", metadata: {} },\n",
+ " { pageContent: \"Buildings are made out of wood\", metadata: {} },\n",
+ " { pageContent: \"Buildings are made out of stone\", metadata: {} },\n",
+ " { pageContent: \"Cars are made out of metal\", metadata: {} },\n",
+ " { pageContent: \"Cars are made out of plastic\", metadata: {} },\n",
+ " { pageContent: \"mitochondria is the powerhouse of the cell\", metadata: {} },\n",
+ " { pageContent: \"mitochondria is made of lipids\", metadata: {} },\n",
+ "], { k: 4 });\n",
+ "\n",
+ "// Will return the 4 documents reranked by the BM25 algorithm\n",
+ "await retriever.invoke(\"mitochondria\");"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "TypeScript",
+ "language": "typescript",
+ "name": "tslab"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "mode": "typescript",
+ "name": "javascript",
+ "typescript": true
+ },
+ "file_extension": ".ts",
+ "mimetype": "text/typescript",
+ "name": "typescript",
+ "version": "3.7.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/libs/langchain-community/.gitignore b/libs/langchain-community/.gitignore
index 8f6f452a7468..1963c98f29be 100644
--- a/libs/langchain-community/.gitignore
+++ b/libs/langchain-community/.gitignore
@@ -618,6 +618,10 @@ retrievers/amazon_knowledge_base.cjs
retrievers/amazon_knowledge_base.js
retrievers/amazon_knowledge_base.d.ts
retrievers/amazon_knowledge_base.d.cts
+retrievers/bm25.cjs
+retrievers/bm25.js
+retrievers/bm25.d.ts
+retrievers/bm25.d.cts
retrievers/chaindesk.cjs
retrievers/chaindesk.js
retrievers/chaindesk.d.ts
diff --git a/libs/langchain-community/langchain.config.js b/libs/langchain-community/langchain.config.js
index 3709f58812e7..ea6b48d2e3a7 100644
--- a/libs/langchain-community/langchain.config.js
+++ b/libs/langchain-community/langchain.config.js
@@ -194,6 +194,7 @@ export const config = {
// retrievers
"retrievers/amazon_kendra": "retrievers/amazon_kendra",
"retrievers/amazon_knowledge_base": "retrievers/amazon_knowledge_base",
+ "retrievers/bm25": "retrievers/bm25",
"retrievers/chaindesk": "retrievers/chaindesk",
"retrievers/databerry": "retrievers/databerry",
"retrievers/dria": "retrievers/dria",
diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json
index 988f307255d0..78ff8a877b6a 100644
--- a/libs/langchain-community/package.json
+++ b/libs/langchain-community/package.json
@@ -2095,6 +2095,15 @@
"import": "./retrievers/amazon_knowledge_base.js",
"require": "./retrievers/amazon_knowledge_base.cjs"
},
+ "./retrievers/bm25": {
+ "types": {
+ "import": "./retrievers/bm25.d.ts",
+ "require": "./retrievers/bm25.d.cts",
+ "default": "./retrievers/bm25.d.ts"
+ },
+ "import": "./retrievers/bm25.js",
+ "require": "./retrievers/bm25.cjs"
+ },
"./retrievers/chaindesk": {
"types": {
"import": "./retrievers/chaindesk.d.ts",
@@ -3682,6 +3691,10 @@
"retrievers/amazon_knowledge_base.js",
"retrievers/amazon_knowledge_base.d.ts",
"retrievers/amazon_knowledge_base.d.cts",
+ "retrievers/bm25.cjs",
+ "retrievers/bm25.js",
+ "retrievers/bm25.d.ts",
+ "retrievers/bm25.d.cts",
"retrievers/chaindesk.cjs",
"retrievers/chaindesk.js",
"retrievers/chaindesk.d.ts",
diff --git a/libs/langchain-community/src/load/import_map.ts b/libs/langchain-community/src/load/import_map.ts
index 59efd56b760e..5bbd9e4d0a01 100644
--- a/libs/langchain-community/src/load/import_map.ts
+++ b/libs/langchain-community/src/load/import_map.ts
@@ -54,6 +54,7 @@ export * as chat_models__moonshot from "../chat_models/moonshot.js";
export * as chat_models__ollama from "../chat_models/ollama.js";
export * as chat_models__togetherai from "../chat_models/togetherai.js";
export * as chat_models__yandex from "../chat_models/yandex.js";
+export * as retrievers__bm25 from "../retrievers/bm25.js";
export * as retrievers__chaindesk from "../retrievers/chaindesk.js";
export * as retrievers__databerry from "../retrievers/databerry.js";
export * as retrievers__remote from "../retrievers/remote/index.js";
diff --git a/libs/langchain-community/src/retrievers/bm25.ts b/libs/langchain-community/src/retrievers/bm25.ts
new file mode 100644
index 000000000000..dfc04709cba1
--- /dev/null
+++ b/libs/langchain-community/src/retrievers/bm25.ts
@@ -0,0 +1,58 @@
+import { BaseRetriever, BaseRetrieverInput } from "@langchain/core/retrievers";
+import { Document } from "@langchain/core/documents";
+
+import { BM25 } from "../utils/@furkantoprak/bm25/BM25.js";
+
+export type BM25RetrieverOptions = {
+ docs: Document[];
+ k: number;
+} & BaseRetrieverInput;
+
+/**
+ * A retriever that ranks an in-memory list of documents against a query with
+ * the BM25 algorithm, using the Okapi BM25 implementation inlined under
+ * `../utils/@furkantoprak/bm25`. `k` is the number of documents returned.
+ */
+export class BM25Retriever extends BaseRetriever {
+  static lc_name() {
+    return "BM25Retriever";
+  }
+
+  lc_namespace = ["langchain", "retrievers", "bm25_retriever"];
+
+  /** Builds a retriever over `documents`; `options` carries every option except `docs`. */
+  static fromDocuments(
+    documents: Document[],
+    options: Omit<BM25RetrieverOptions, "docs">
+  ) {
+    return new this({ ...options, docs: documents });
+  }
+
+  docs: Document[];
+
+  k: number;
+
+  constructor(options: BM25RetrieverOptions) {
+    super(options);
+    this.docs = options.docs;
+    this.k = options.k;
+  }
+
+  /** Lower-cases `text` and splits it on whitespace, dropping empty tokens. */
+  private preprocessFunc(text: string): string[] {
+    return text.toLowerCase().split(/\s+/).filter(Boolean);
+  }
+
+  /** Scores every stored document against `query` and returns the `k` best, highest score first. */
+  async _getRelevantDocuments(query: string) {
+    const keywords = this.preprocessFunc(query);
+    // NOTE(review): only the query is lower-cased; page contents are scored
+    // as-is, so matching is case-sensitive on the document side.
+    const texts = this.docs.map((doc) => doc.pageContent);
+    const scores = BM25(texts, keywords) as number[];
+    const ranked = this.docs
+      .map((document, index) => ({ document, score: scores[index] }))
+      .sort((a, b) => b.score - a.score);
+    return ranked.slice(0, this.k).map((item) => item.document);
+  }
+}
diff --git a/libs/langchain-community/src/retrievers/tests/bm25.test.ts b/libs/langchain-community/src/retrievers/tests/bm25.test.ts
new file mode 100644
index 000000000000..bcfe46f940b8
--- /dev/null
+++ b/libs/langchain-community/src/retrievers/tests/bm25.test.ts
@@ -0,0 +1,27 @@
+import { expect, test } from "@jest/globals";
+import { Document } from "@langchain/core/documents";
+import { BM25Retriever } from "../bm25.js";
+
+test("BM25Retriever", async () => {
+  // Corpus of three short sentences, each wrapped in a `Document` before
+  // being handed to the retriever.
+  const texts = [
+    "The quick brown fox jumps over the lazy dog",
+    "A lazy dog sleeps all day",
+    "The brown fox is quick and clever",
+  ];
+  const corpus = texts.map((pageContent) => new Document({ pageContent }));
+
+  // Ask for the two highest-scoring documents only.
+  const retriever = BM25Retriever.fromDocuments(corpus, { k: 2 });
+
+  // Run a single multi-word query through the public `invoke` entry point.
+  const results = await retriever.invoke("the fox and the dog");
+
+  // Exactly `k` documents come back.
+  expect(results).toHaveLength(2);
+  // The first sentence of the corpus is expected to rank first.
+  expect(results[0].pageContent).toBe(
+    "The quick brown fox jumps over the lazy dog"
+  );
+});
diff --git a/libs/langchain-community/src/utils/@furkantoprak/bm25/BM25.ts b/libs/langchain-community/src/utils/@furkantoprak/bm25/BM25.ts
new file mode 100644
index 000000000000..e3e2621168e7
--- /dev/null
+++ b/libs/langchain-community/src/utils/@furkantoprak/bm25/BM25.ts
@@ -0,0 +1,100 @@
+/**
+ * Adapted from
+ * https://github.com/FurkanToprak/OkapiBM25
+ *
+ * Inlined due to CJS import issues.
+ */
+
+/** Counts the words in `corpus`, a word being a maximal run of `\w` characters; a missing corpus counts as 0. */
+export const getWordCount = (corpus: string) => {
+  return (corpus || "").match(/\w+/g)?.length ?? 0;
+};
+
+/** Number of non-overlapping occurrences of `term` in `corpus`; `term` is matched literally, and an empty term counts as 0. */
+export const getTermFrequency = (term: string, corpus: string) => {
+  return term ? (corpus || "").split(term).length - 1 : 0;
+};
+
+/** Smoothed inverse document frequency of `term`: ln((N - n + 0.5) / (n + 0.5) + 1). */
+export const getIDF = (term: string, documents: string[]) => {
+  // n: how many documents contain the term as a substring.
+  const matching = documents.reduce(
+    (count, text) => (text.includes(term) ? count + 1 : count),
+    0
+  );
+  const ratio = (documents.length - matching + 0.5) / (matching + 0.5);
+  return Math.log(ratio + 1);
+};
+
+/** Represents a document; useful when sorting results.
+ */
+export interface BMDocument {
+ /** The document that was scored. */
+ document: string;
+ /** The score that the document receives. */
+ score: number;
+}
+
+/** Free parameters of BM25, used for term-frequency saturation (k1) and document-length normalization (b) when scoring. */
+export interface BMConstants {
+ /** Free parameter. Is 0.75 by default. */
+ b?: number;
+ /** Free parameter. Is 1.2 by default. Generally in range [1.2, 2.0] */
+ k1?: number;
+}
+
+/** If it returns a positive number, secondEl is sorted before firstEl; otherwise firstEl comes before secondEl. */
+export type BMSorter = (firstEl: BMDocument, secondEl: BMDocument) => number;
+
+/**
+ * Okapi BM25: scores every document in `documents` against the query terms in `keywords`.
+ * @param documents Collection of documents.
+ * @param keywords Query terms; a term listed more than once contributes once per listing.
+ * @param constants Free parameters k1 and b. b=0.75 and k1=1.2 by default; an explicit 0 is honoured.
+ * @param sorter Optional comparator. If provided, the result is an array of documents with their scores, ordered by it.
+ * @returns Scores in the original document order, or sorted `BMDocument`s when `sorter` is provided.
+ */
+export function BM25(
+  documents: string[],
+  keywords: string[],
+  constants?: BMConstants,
+  sorter?: BMSorter
+): number[] | BMDocument[] {
+  // `??` instead of a truthiness test: 0 is a legitimate setting for both parameters.
+  const b = constants?.b ?? 0.75;
+  const k1 = constants?.k1 ?? 1.2;
+
+  const documentLengths = documents.map((document: string) =>
+    getWordCount(document)
+  );
+  const totalLength = documentLengths.reduce((sum, length) => sum + length, 0);
+  // Fall back to 1 when no document contains a word, so the ratio below is never 0/0.
+  const averageDocumentLength = totalLength / documents.length || 1;
+  // One IDF per keyword position, so repeated keywords line up with `keywords`.
+  const idfs = keywords.map((keyword) => getIDF(keyword, documents));
+
+  const scores = documents.map((document: string, index: number) => {
+    // Length normalisation depends only on the document, not on the keyword.
+    const lengthNorm =
+      k1 * (1 - b + (b * documentLengths[index]) / averageDocumentLength);
+    const score = keywords.reduce((total, keyword, position) => {
+      const termFrequency = getTermFrequency(keyword, document);
+      // An absent term adds nothing; returning early also avoids 0/0 when k1 is 0.
+      if (termFrequency === 0) {
+        return total;
+      }
+      return (
+        total +
+        (idfs[position] * (termFrequency * (k1 + 1))) /
+          (termFrequency + lengthNorm)
+      );
+    }, 0);
+    return sorter ? ({ score, document } as BMDocument) : score;
+  });
+
+  // Sort only when a comparator was supplied.
+  if (sorter) {
+    return (scores as BMDocument[]).sort(sorter);
+  }
+  return scores as number[];
+}
diff --git a/libs/langchain-community/src/utils/@furkantoprak/bm25/LICENSE.md b/libs/langchain-community/src/utils/@furkantoprak/bm25/LICENSE.md
new file mode 100644
index 000000000000..8dd59105dd36
--- /dev/null
+++ b/libs/langchain-community/src/utils/@furkantoprak/bm25/LICENSE.md
@@ -0,0 +1,21 @@
+# MIT License
+
+## Copyright (c) 2020 Furkan Toprak
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.