Skip to content

Commit

Permalink
Filter null documents in filesystem loaders (langchain-ai#849)
Browse files — browse the repository at this point in the history
* Filter null documents in filesystem loaders

* Fix tsc errors
  • Loading branch information
nfcampos authored and RohitMidha23 committed Apr 18, 2023
1 parent 71d0be5 commit 500f51d
Show file tree
Hide file tree
Showing 9 changed files with 38 additions and 14 deletions.
1 change: 1 addition & 0 deletions langchain/.eslintrc.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ module.exports = {
"no-console": 0,
"no-restricted-syntax": 0,
"no-shadow": 0,
"no-continue": 0,
"no-underscore-dangle": 0,
"no-use-before-define": 0,
"no-useless-constructor": 0,
Expand Down
8 changes: 4 additions & 4 deletions langchain/src/document.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ export interface DocumentParams<
> {
pageContent: string;

metadata: Metadata;
metadata?: Metadata;
}

/**
Expand All @@ -19,10 +19,10 @@ export class Document<

metadata: Metadata;

constructor(fields?: Partial<DocumentParams<Metadata>>) {
this.pageContent = fields?.pageContent
constructor(fields: DocumentParams<Metadata>) {
this.pageContent = fields.pageContent
? fields.pageContent.toString()
: this.pageContent;
this.metadata = fields?.metadata ?? ({} as Metadata);
this.metadata = fields.metadata ?? ({} as Metadata);
}
}
3 changes: 3 additions & 0 deletions langchain/src/document_loaders/fs/docx.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ export class DocxLoader extends BufferLoader {
const docx = await extractRawText({
buffer: raw,
});

if (!docx.value) return [];

return [
new Document({
pageContent: docx.value,
Expand Down
7 changes: 6 additions & 1 deletion langchain/src/document_loaders/fs/epub.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@ export class EPubLoader extends BaseDocumentLoader {
const chapters = await Promise.all(
epub.flow.map(async (chapter) => {
if (!chapter.id) return null as never;
const html: string = await epub.getChapterRawAsync(chapter.id);
if (!html) return null as never;
return {
html: await epub.getChapterRawAsync(chapter.id),
html,
title: chapter.title,
};
})
Expand All @@ -37,6 +39,9 @@ export class EPubLoader extends BaseDocumentLoader {

const parsed = await this.parse(epub);
const metadata = { source: this.filePath };

if (parsed.length === 0) return [];

return this.splitChapters
? parsed.map(
(chapter) =>
Expand Down
9 changes: 9 additions & 0 deletions langchain/src/document_loaders/fs/pdf.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ export class PDFLoader extends BufferLoader {
for (let i = 1; i <= pdf.numPages; i += 1) {
const page = await pdf.getPage(i);
const content = await page.getTextContent();

if (content.items.length === 0) {
continue;
}

const text = content.items
.map((item) => (item as TextItem).str)
.join("\n");
Expand Down Expand Up @@ -61,6 +66,10 @@ export class PDFLoader extends BufferLoader {
return documents;
}

if (documents.length === 0) {
return [];
}

return [
new Document({
pageContent: documents.map((doc) => doc.pageContent).join("\n\n"),
Expand Down
7 changes: 6 additions & 1 deletion langchain/src/document_loaders/fs/srt.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,12 @@ export class SRTLoader extends TextLoader {
const { SRTParser2 } = await SRTLoaderImports();
const parser = new SRTParser2();
const srts = parser.fromSrt(raw);
return [srts.map((srt) => srt.text).join(" ")];
return [
srts
.map((srt) => srt.text)
.filter(Boolean)
.join(" "),
];
}
}

Expand Down
2 changes: 1 addition & 1 deletion langchain/src/document_loaders/fs/unstructured.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ export class UnstructuredLoader extends BaseDocumentLoader {
`Expected partitioning request to return an array, but got ${elements}`
);
}
return elements as Element[];
return elements.filter((el) => typeof el.text === "string") as Element[];
}

async load(): Promise<Document[]> {
Expand Down
11 changes: 6 additions & 5 deletions langchain/src/vectorstores/milvus.ts
Original file line number Diff line number Diff line change
Expand Up @@ -230,20 +230,21 @@ export class Milvus extends VectorStore {
}
const results: [Document, number][] = [];
searchResp.results.forEach((result) => {
const doc = new Document();
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const fields = { pageContent: "", metadata: {} as Record<string, any> };
Object.keys(result).forEach((key) => {
if (key === this.textField) {
doc.pageContent = result[key];
fields.pageContent = result[key];
} else if (this.fields.includes(key)) {
if (typeof result[key] === "string") {
const { isJson, obj } = checkJsonString(result[key]);
doc.metadata[key] = isJson ? obj : result[key];
fields.metadata[key] = isJson ? obj : result[key];
} else {
doc.metadata[key] = result[key];
fields.metadata[key] = result[key];
}
}
});
results.push([doc, result.score]);
results.push([new Document(fields), result.score]);
});
// console.log("Search result: " + JSON.stringify(results, null, 2));
return results;
Expand Down
4 changes: 2 additions & 2 deletions langchain/src/vectorstores/prisma.ts
Original file line number Diff line number Diff line change
Expand Up @@ -262,10 +262,10 @@ export class PrismaVectorStore<
const results: [Document<SimilarityModel<TModel, TSelectModel>>, number][] =
[];
for (const article of articles) {
if (article._distance != null) {
if (article._distance != null && article[this.contentColumn] != null) {
results.push([
new Document({
pageContent: article[this.contentColumn] as string | undefined,
pageContent: article[this.contentColumn] as string,
metadata: article,
}),
article._distance,
Expand Down

0 comments on commit 500f51d

Please sign in to comment.