From 64226da5a578757f2806c3e0235d812a6866ff14 Mon Sep 17 00:00:00 2001 From: nick-w-nick <43578531+nick-w-nick@users.noreply.github.com> Date: Fri, 16 Aug 2024 05:39:19 -0400 Subject: [PATCH] pinecone[patch]: Fix document ID not getting set when returned from PineconeStore (#6539) * added _formatMatches private method * updated similaritySearchVectorWithScore to use _formatMatches * updated maxMarginalRelevanceSearch to use _formatMatches * Add integration test --------- Co-authored-by: jacoblee93 --- .../src/tests/vectorstores.int.test.ts | 60 ++++++++++++----- libs/langchain-pinecone/src/vectorstores.ts | 66 ++++++++++++------- 2 files changed, 83 insertions(+), 43 deletions(-) diff --git a/libs/langchain-pinecone/src/tests/vectorstores.int.test.ts b/libs/langchain-pinecone/src/tests/vectorstores.int.test.ts index b4cb28c50b05..074dccae9004 100644 --- a/libs/langchain-pinecone/src/tests/vectorstores.int.test.ts +++ b/libs/langchain-pinecone/src/tests/vectorstores.int.test.ts @@ -1,6 +1,7 @@ /* eslint-disable no-process-env */ /* eslint-disable @typescript-eslint/no-non-null-assertion */ /* eslint-disable no-promise-executor-return */ +/* eslint-disable @typescript-eslint/no-explicit-any */ import { describe, expect, test } from "@jest/globals"; import { faker } from "@faker-js/faker"; import { Pinecone } from "@pinecone-database/pinecone"; @@ -9,12 +10,14 @@ import { SyntheticEmbeddings } from "@langchain/core/utils/testing"; import { Document } from "@langchain/core/documents"; import { PineconeStoreParams, PineconeStore } from "../vectorstores.js"; +const PINECONE_SLEEP_LENGTH = 40000; + function sleep(ms: number) { // eslint-disable-next-line no-promise-executor-return return new Promise((resolve) => setTimeout(resolve, ms)); } -describe.skip("PineconeStore", () => { +describe("PineconeStore", () => { let pineconeStore: PineconeStore; const testIndexName = process.env.PINECONE_INDEX!; let namespaces: string[] = []; @@ -57,22 +60,29 @@ 
describe.skip("PineconeStore", () => { [{ pageContent, metadata: {} }], [documentId] ); - await sleep(35000); + + await sleep(PINECONE_SLEEP_LENGTH); const results = await pineconeStore.similaritySearch(pageContent, 1); - expect(results).toEqual([new Document({ metadata: {}, pageContent })]); + expect(results).toEqual([ + new Document({ metadata: {}, pageContent, id: documentId }), + ]); await pineconeStore.addDocuments( [{ pageContent: `${pageContent} upserted`, metadata: {} }], [documentId] ); - await sleep(35000); + await sleep(PINECONE_SLEEP_LENGTH); const results2 = await pineconeStore.similaritySearch(pageContent, 1); expect(results2).toEqual([ - new Document({ metadata: {}, pageContent: `${pageContent} upserted` }), + new Document({ + metadata: {}, + pageContent: `${pageContent} upserted`, + id: documentId, + }), ]); }); @@ -83,11 +93,15 @@ describe.skip("PineconeStore", () => { { pageContent, metadata: { foo: "bar" } }, ]); - await sleep(35000); + await sleep(PINECONE_SLEEP_LENGTH); const results = await pineconeStore.similaritySearch(pageContent, 1); expect(results).toEqual([ - new Document({ metadata: { foo: "bar" }, pageContent }), + new Document({ + metadata: { foo: "bar" }, + pageContent, + id: expect.any(String) as any, + }), ]); }); @@ -100,14 +114,18 @@ describe.skip("PineconeStore", () => { { pageContent, metadata: { foo: id } }, { pageContent, metadata: { foo: "qux" } }, ]); - await sleep(35000); + await sleep(PINECONE_SLEEP_LENGTH); // If the filter wasn't working, we'd get all 3 documents back const results = await pineconeStore.similaritySearch(pageContent, 3, { foo: id, }); expect(results).toEqual([ - new Document({ metadata: { foo: id }, pageContent }), + new Document({ + metadata: { foo: id }, + pageContent, + id: expect.any(String) as any, + }), ]); }); @@ -120,7 +138,7 @@ describe.skip("PineconeStore", () => { { pageContent, metadata: { foo: id } }, { pageContent, metadata: { foo: id } }, ]); - await sleep(35000); + await 
sleep(PINECONE_SLEEP_LENGTH); // If the filter wasn't working, we'd get all 3 documents back const results = await pineconeStore.maxMarginalRelevanceSearch( pageContent, @@ -142,7 +160,7 @@ describe.skip("PineconeStore", () => { { pageContent, metadata: { foo: id } }, { pageContent, metadata: { foo: id } }, ]); - await sleep(35000); + await sleep(PINECONE_SLEEP_LENGTH); const results = await pineconeStore.similaritySearch(pageContent, 2, { foo: id, }); @@ -174,7 +192,7 @@ describe.skip("PineconeStore", () => { ids: [id, id2], } ); - await sleep(40000); + await sleep(PINECONE_SLEEP_LENGTH); const indexStats = await pineconeStore.pineconeIndex.describeIndexStats(); expect(indexStats.namespaces).toHaveProperty(""); expect(indexStats.namespaces?.[""].recordCount).toEqual(2); @@ -184,7 +202,7 @@ describe.skip("PineconeStore", () => { await pineconeStore.delete({ deleteAll: true, }); - await sleep(40000); + await sleep(PINECONE_SLEEP_LENGTH); const indexStats2 = await pineconeStore.pineconeIndex.describeIndexStats(); expect(indexStats2.namespaces).not.toHaveProperty(""); // The new total records should be less than the previous total records @@ -209,7 +227,7 @@ describe.skip("PineconeStore", () => { namespace: namespaces[1], } ); - await sleep(35000); + await sleep(PINECONE_SLEEP_LENGTH); const results = await pineconeStore.similaritySearch(pageContent, 1, { namespace: namespaces[0], }); @@ -234,22 +252,28 @@ describe.skip("PineconeStore", () => { }); await store.addDocuments([{ pageContent, metadata: {} }], [documentId]); - await sleep(35000); + await sleep(PINECONE_SLEEP_LENGTH); const results = await store.similaritySearch(pageContent, 1); - expect(results).toEqual([new Document({ metadata: {}, pageContent })]); + expect(results).toEqual([ + new Document({ metadata: {}, pageContent, id: documentId }), + ]); await store.addDocuments( [{ pageContent: `${pageContent} upserted`, metadata: {} }], [documentId] ); - await sleep(35000); + await sleep(PINECONE_SLEEP_LENGTH); 
const results2 = await store.similaritySearch(pageContent, 1); expect(results2).toEqual([ - new Document({ metadata: {}, pageContent: `${pageContent} upserted` }), + new Document({ + metadata: {}, + pageContent: `${pageContent} upserted`, + id: documentId, + }), ]); }); }); diff --git a/libs/langchain-pinecone/src/vectorstores.ts b/libs/langchain-pinecone/src/vectorstores.ts index 3a99dbceb057..2228d2456744 100644 --- a/libs/langchain-pinecone/src/vectorstores.ts +++ b/libs/langchain-pinecone/src/vectorstores.ts @@ -5,6 +5,7 @@ import { RecordMetadata, PineconeRecord, Index as PineconeIndex, + ScoredPineconeRecord, } from "@pinecone-database/pinecone"; import type { EmbeddingsInterface } from "@langchain/core/embeddings"; @@ -401,6 +402,40 @@ export class PineconeStore extends VectorStore { return results; } + /** + * Format the matching results from the Pinecone query. + * @param matches Matching results from the Pinecone query. + * @returns An array of arrays, where each inner array contains a document and its score. + */ + private _formatMatches( + matches: ScoredPineconeRecord[] = [] + ): [Document, number][] { + const documentsWithScores: [Document, number][] = []; + + for (const record of matches) { + const { + id, + score, + metadata: { [this.textKey]: pageContent, ...metadata } = { + [this.textKey]: "", + }, + } = record; + + if (score) { + documentsWithScores.push([ + new Document({ + id, + pageContent: pageContent.toString(), + metadata, + }), + score, + ]); + } + } + + return documentsWithScores; + } + /** * Method that performs a similarity search in the Pinecone database and * returns the results along with their scores. 
@@ -414,20 +449,10 @@ export class PineconeStore extends VectorStore { k: number, filter?: PineconeMetadata ): Promise<[Document, number][]> { - const results = await this._runPineconeQuery(query, k, filter); - const result: [Document, number][] = []; - - if (results.matches) { - for (const res of results.matches) { - const { [this.textKey]: pageContent, ...metadata } = (res.metadata ?? - {}) as PineconeMetadata; - if (res.score) { - result.push([new Document({ metadata, pageContent }), res.score]); - } - } - } + const { matches = [] } = await this._runPineconeQuery(query, k, filter); + const records = this._formatMatches(matches); - return result; + return records; } /** @@ -457,7 +482,7 @@ export class PineconeStore extends VectorStore { { includeValues: true } ); - const matches = results?.matches ?? []; + const { matches = [] } = results; const embeddingList = matches.map((match) => match.values); const mmrIndexes = maximalMarginalRelevance( @@ -468,17 +493,8 @@ export class PineconeStore extends VectorStore { ); const topMmrMatches = mmrIndexes.map((idx) => matches[idx]); - - const finalResult: Document[] = []; - for (const res of topMmrMatches) { - const { [this.textKey]: pageContent, ...metadata } = (res.metadata ?? - {}) as PineconeMetadata; - if (res.score) { - finalResult.push(new Document({ metadata, pageContent })); - } - } - - return finalResult; + const records = this._formatMatches(topMmrMatches); + return records.map(([doc, _score]) => doc); } /**