Skip to content

Commit

Permalink
✨ feat: Add configurable PDF processing method with Unstructured (lobehub#5927)
Browse files Browse the repository at this point in the history

* ✨ feat: Add configurable PDF processing method with Unstructured

* 🔧 fix: Update import path for env utility in ContentChunk module

* feat: add USE_UNSTRUCTURED_FOR_PDF environment variable to knowledge config

* Delete src/server/utils/env.ts

* feat: implement ChunkingRuleParser for file type and service mapping

* refactor: remove USE_UNSTRUCTURED_FOR_PDF from knowledge environment configuration

* test: add unit tests for ChunkingRuleParser functionality

* refactor: remove isUsingUnstructured method from ContentChunk class

* refactor: update ChunkingService type and clean up ContentChunk rules

* refactor: simplify ChunkingRuleParser and update ContentChunk module

* refactor: update ContentChunk module import for ChunkingService
  • Loading branch information
fzlzjerry authored Feb 15, 2025
1 parent b61cec7 commit 35fa3ee
Show file tree
Hide file tree
Showing 4 changed files with 162 additions and 22 deletions.
30 changes: 14 additions & 16 deletions src/config/knowledge.ts
Original file line number Diff line number Diff line change
@@ -1,19 +1,17 @@
import { createEnv } from '@t3-oss/env-nextjs';
import { z } from 'zod';

/**
 * Builds the knowledge env object by passing `createEnv` two maps:
 * `runtimeEnv` (the raw `process.env` values) and `server` (a zod schema per
 * variable). Every variable listed here is an optional string.
 */
export const getKnowledgeConfig = () => {
return createEnv({
// Each value is read with a literal `process.env.NAME` access.
runtimeEnv: {
DEFAULT_FILES_CONFIG: process.env.DEFAULT_FILES_CONFIG,
UNSTRUCTURED_API_KEY: process.env.UNSTRUCTURED_API_KEY,
UNSTRUCTURED_SERVER_URL: process.env.UNSTRUCTURED_SERVER_URL,
},
// Validation schema: every variable may be absent.
server: {
DEFAULT_FILES_CONFIG: z.string().optional(),
UNSTRUCTURED_API_KEY: z.string().optional(),
UNSTRUCTURED_SERVER_URL: z.string().optional(),
},
});
};

// The config is built once, when this module is first evaluated.
export const knowledgeEnv = getKnowledgeConfig();
/**
 * Knowledge/file-processing environment, built by `createEnv` from the raw
 * `process.env` values (`runtimeEnv`) and a zod schema per variable (`server`).
 * Every variable is an optional string, so a missing value is `undefined`.
 */
export const knowledgeEnv = createEnv({
// Each value is read with a literal `process.env.NAME` access.
runtimeEnv: {
DEFAULT_FILES_CONFIG: process.env.DEFAULT_FILES_CONFIG,
// Rule string that ContentChunk hands to ChunkingRuleParser.parse,
// e.g. `pdf=unstructured,default;doc=doc2x,default;txt=default`.
FILE_TYPE_CHUNKING_RULES: process.env.FILE_TYPE_CHUNKING_RULES,
UNSTRUCTURED_API_KEY: process.env.UNSTRUCTURED_API_KEY,
UNSTRUCTURED_SERVER_URL: process.env.UNSTRUCTURED_SERVER_URL,
},
// Validation schema: every variable may be absent.
server: {
DEFAULT_FILES_CONFIG: z.string().optional(),
FILE_TYPE_CHUNKING_RULES: z.string().optional(),
UNSTRUCTURED_API_KEY: z.string().optional(),
UNSTRUCTURED_SERVER_URL: z.string().optional(),
},
});
50 changes: 44 additions & 6 deletions src/server/modules/ContentChunk/index.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
import { ChunkingLoader } from 'src/libs/langchain';
import { Strategy } from 'unstructured-client/sdk/models/shared';

import { NewChunkItem, NewUnstructuredChunkItem } from '@/database/schemas';
import { knowledgeEnv } from '@/config/knowledge';
import type { NewChunkItem, NewUnstructuredChunkItem } from '@/database/schemas';
import { ChunkingStrategy, Unstructured } from '@/libs/unstructured';

import { ChunkingRuleParser } from './rules';
import type { ChunkingService } from './rules';

export interface ChunkContentParams {
content: Uint8Array;
fileType: string;
Expand All @@ -19,23 +23,57 @@ interface ChunkResult {
export class ContentChunk {
private unstructuredClient: Unstructured;
private langchainClient: ChunkingLoader;
private chunkingRules: Record<string, ChunkingService[]>;

/**
 * Instantiates the Unstructured and LangChain clients, then parses the
 * per-file-type chunking rules from `FILE_TYPE_CHUNKING_RULES`
 * (an unset or empty value is parsed as the empty string).
 */
constructor() {
  this.unstructuredClient = new Unstructured();
  this.langchainClient = new ChunkingLoader();

  const rawRules = knowledgeEnv.FILE_TYPE_CHUNKING_RULES;
  this.chunkingRules = ChunkingRuleParser.parse(rawRules ? rawRules : '');
}

isUsingUnstructured(params: ChunkContentParams) {
return params.fileType === 'application/pdf' && params.mode === 'hi-res';
/**
 * Looks up the ordered list of chunking services for a file type.
 *
 * The lookup key is the lower-cased text after the last `/` of `fileType`
 * (the whole string when there is no `/`). When no rule exists for that key
 * the result is `['default']`.
 */
private getChunkingServices(fileType: string): ChunkingService[] {
  const lastSegment = fileType.split('/').at(-1) ?? '';
  const configured = this.chunkingRules[lastSegment.toLowerCase()];

  // A configured list is returned as-is, even when it is empty.
  return configured ?? ['default'];
}

/**
 * Chunks a file by trying, in order, the services configured for its file type.
 *
 * - `unstructured`: used only when `canUseUnstructured()` is true, otherwise skipped.
 * - `doc2x`: placeholder, currently skipped.
 * - any other value (i.e. `default`): chunked with LangChain.
 *
 * An error thrown by a service is logged and the next service is tried, unless
 * the failing service equals the last entry of the list, in which case the
 * error is rethrown. If the loop ends without returning, LangChain is used.
 */
async chunkContent(params: ChunkContentParams): Promise<ChunkResult> {
// NOTE(review): the commit message says `isUsingUnstructured` was removed from this
// class; the next two lines look like the pre-change side of the diff — confirm they are stale.
if (this.isUsingUnstructured(params))
return await this.chunkByUnstructured(params.filename, params.content);

const services = this.getChunkingServices(params.fileType);

for (const service of services) {
try {
switch (service) {
case 'unstructured': {
// Skipped (falls through to the next service) when credentials are not configured.
if (this.canUseUnstructured()) {
return await this.chunkByUnstructured(params.filename, params.content);
}
break;
}

case 'doc2x': {
// Future implementation
break;
}

default: {
return await this.chunkByLangChain(params.filename, params.content);
}
}
} catch (error) {
// If this is the last service, throw the error
// NOTE(review): this compares by value, not position. With a repeated name such as
// `unstructured,default,unstructured`, a failure of the first `unstructured` is
// rethrown immediately and `default` is never tried — consider comparing indices.
if (service === services.at(-1)) throw error;
// Otherwise continue to next service
console.error(`Chunking failed with service ${service}:`, error);
}
}

// Fallback to langchain if no service succeeded
return await this.chunkByLangChain(params.filename, params.content);
}

/**
 * Reports whether both `UNSTRUCTURED_API_KEY` and `UNSTRUCTURED_SERVER_URL`
 * are set to non-empty values.
 */
private canUseUnstructured(): boolean {
  // The server URL is only read when an API key is present.
  if (!knowledgeEnv.UNSTRUCTURED_API_KEY) return false;
  return Boolean(knowledgeEnv.UNSTRUCTURED_SERVER_URL);
}

private chunkByUnstructured = async (
filename: string,
content: Uint8Array,
Expand Down
81 changes: 81 additions & 0 deletions src/server/modules/ContentChunk/rules.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import { describe, expect, it } from 'vitest';
import { ChunkingRuleParser } from './rules';

describe('ChunkingRuleParser', () => {
  // Parses `rules` and asserts that the resulting mapping deep-equals `expected`.
  const expectParsed = (rules: string, expected: Record<string, string[]>) => {
    expect(ChunkingRuleParser.parse(rules)).toEqual(expected);
  };

  describe('parse', () => {
    it('should parse a single file type rule correctly', () => {
      expectParsed('pdf=unstructured,default', {
        pdf: ['unstructured', 'default'],
      });
    });

    it('should parse multiple file type rules correctly', () => {
      expectParsed('pdf=unstructured,default;doc=doc2x,default;txt=default', {
        pdf: ['unstructured', 'default'],
        doc: ['doc2x', 'default'],
        txt: ['default'],
      });
    });

    it('should convert file types to lowercase', () => {
      expectParsed('PDF=unstructured;DOC=doc2x', {
        pdf: ['unstructured'],
        doc: ['doc2x'],
      });
    });

    it('should filter out invalid service names', () => {
      expectParsed('pdf=unstructured,invalid,default,wrongservice', {
        pdf: ['unstructured', 'default'],
      });
    });

    it('should handle empty string input', () => {
      expectParsed('', {});
    });

    it('should skip invalid rule formats', () => {
      // Entries without `=`, without a file type, or without services are dropped.
      expectParsed('pdf=unstructured;invalid;doc=doc2x;=default;txt', {
        pdf: ['unstructured'],
        doc: ['doc2x'],
      });
    });

    it('should handle whitespace in service names', () => {
      expectParsed('pdf= unstructured , default ;doc=doc2x', {
        pdf: ['unstructured', 'default'],
        doc: ['doc2x'],
      });
    });

    it('should handle duplicate services for same file type', () => {
      // Duplicates are preserved in their original order.
      expectParsed('pdf=unstructured,default,unstructured', {
        pdf: ['unstructured', 'default', 'unstructured'],
      });
    });
  });
});
23 changes: 23 additions & 0 deletions src/server/modules/ContentChunk/rules.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/** Every service name a chunking rule is allowed to reference. */
const CHUNKING_SERVICES = ['unstructured', 'doc2x', 'default'] as const;

/** Name of a chunking backend; derived from the allow-list so the two cannot drift apart. */
export type ChunkingService = (typeof CHUNKING_SERVICES)[number];

/** Type guard: true when `value` is one of the known service names. */
const isChunkingService = (value: string): value is ChunkingService =>
  (CHUNKING_SERVICES as readonly string[]).includes(value);

export const ChunkingRuleParser = {
  /**
   * Parses a rule string of the form `type=service,service;type=service`
   * (for example `pdf=unstructured,default;txt=default`).
   *
   * - File types and service names are trimmed and lower-cased, so
   *   `PDF = Unstructured ; doc=doc2x` works as expected.
   * - Entries without `=`, with an empty file type, or with nothing after
   *   `=` are skipped.
   * - Unknown service names are dropped; order and duplicates of the valid
   *   ones are preserved. An entry whose names are all unknown maps to `[]`.
   * - If a file type appears more than once, the last entry wins.
   *
   * @param rulesStr - Raw rule string; the empty string yields `{}`.
   * @returns Map from lower-cased file type to its ordered service list.
   */
  parse(rulesStr: string): Record<string, ChunkingService[]> {
    const rules: Record<string, ChunkingService[]> = {};

    // Rules for different file types are separated by semicolons.
    for (const rule of rulesStr.split(';')) {
      const [rawFileType, services] = rule.split('=');

      // Trim the key: whitespace around `;` or `=` is common in env values
      // and would otherwise produce keys such as ' doc' that never match.
      const fileType = (rawFileType ?? '').trim().toLowerCase();
      if (!fileType || !services) continue;

      // Services are comma-separated; keep only the recognised names.
      rules[fileType] = services
        .split(',')
        .map((s) => s.trim().toLowerCase())
        .filter(isChunkingService);
    }

    return rules;
  },
} as const;

0 comments on commit 35fa3ee

Please sign in to comment.