forked from lobehub/lobe-chat
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
✨ feat: Add configurable PDF processing method with Unstructured (lob…
…ehub#5927) * ✨ feat: Add configurable PDF processing method with Unstructured * 🔧 fix: Update import path for env utility in ContentChunk module * feat: add USE_UNSTRUCTURED_FOR_PDF environment variable to knowledge config * Delete src/server/utils/env.ts * feat: implement ChunkingRuleParser for file type and service mapping * refactor: remove USE_UNSTRUCTURED_FOR_PDF from knowledge environment configuration * test: add unit tests for ChunkingRuleParser functionality * refactor: remove isUsingUnstructured method from ContentChunk class * refactor: update ChunkingService type and clean up ContentChunk rules * refactor: simplify ChunkingRuleParser and update ContentChunk module * refactor: update ContentChunk module import for ChunkingService
- Loading branch information
Showing
4 changed files
with
162 additions
and
22 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,19 +1,17 @@ | ||
import { createEnv } from '@t3-oss/env-nextjs'; | ||
import { z } from 'zod'; | ||
|
||
/**
 * Builds the validated environment object for the knowledge-base feature.
 *
 * Every variable is declared as an optional string, so validation does not
 * fail when any of them is missing from `process.env`.
 */
export const getKnowledgeConfig = () =>
  createEnv({
    // Schema for the variables declared under the `server` key.
    server: {
      DEFAULT_FILES_CONFIG: z.string().optional(),
      UNSTRUCTURED_API_KEY: z.string().optional(),
      UNSTRUCTURED_SERVER_URL: z.string().optional(),
    },
    // Explicit mapping from each declared name to its `process.env` value.
    runtimeEnv: {
      DEFAULT_FILES_CONFIG: process.env.DEFAULT_FILES_CONFIG,
      UNSTRUCTURED_API_KEY: process.env.UNSTRUCTURED_API_KEY,
      UNSTRUCTURED_SERVER_URL: process.env.UNSTRUCTURED_SERVER_URL,
    },
  });

/** Environment object evaluated once when this module is loaded. */
export const knowledgeEnv = getKnowledgeConfig();
/** Schema for the knowledge-base variables; each one is an optional string. */
const knowledgeServerSchema = {
  DEFAULT_FILES_CONFIG: z.string().optional(),
  FILE_TYPE_CHUNKING_RULES: z.string().optional(),
  UNSTRUCTURED_API_KEY: z.string().optional(),
  UNSTRUCTURED_SERVER_URL: z.string().optional(),
};

/** Raw `process.env` values for every name declared in the schema. */
const knowledgeRuntimeEnv = {
  DEFAULT_FILES_CONFIG: process.env.DEFAULT_FILES_CONFIG,
  FILE_TYPE_CHUNKING_RULES: process.env.FILE_TYPE_CHUNKING_RULES,
  UNSTRUCTURED_API_KEY: process.env.UNSTRUCTURED_API_KEY,
  UNSTRUCTURED_SERVER_URL: process.env.UNSTRUCTURED_SERVER_URL,
};

/**
 * Validated environment object for the knowledge-base feature, created once
 * when this module is loaded.
 */
export const knowledgeEnv = createEnv({
  runtimeEnv: knowledgeRuntimeEnv,
  server: knowledgeServerSchema,
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
import { describe, expect, it } from 'vitest'; | ||
import { ChunkingRuleParser } from './rules'; | ||
|
||
/**
 * Table-driven suite for `ChunkingRuleParser.parse`: each scenario feeds one
 * rule string to the parser and compares the resulting map with `toEqual`.
 */
describe('ChunkingRuleParser', () => {
  describe('parse', () => {
    const scenarios: Array<{
      expected: Record<string, string[]>;
      input: string;
      title: string;
    }> = [
      {
        title: 'should parse a single file type rule correctly',
        input: 'pdf=unstructured,default',
        expected: { pdf: ['unstructured', 'default'] },
      },
      {
        title: 'should parse multiple file type rules correctly',
        input: 'pdf=unstructured,default;doc=doc2x,default;txt=default',
        expected: {
          pdf: ['unstructured', 'default'],
          doc: ['doc2x', 'default'],
          txt: ['default'],
        },
      },
      {
        title: 'should convert file types to lowercase',
        input: 'PDF=unstructured;DOC=doc2x',
        expected: { pdf: ['unstructured'], doc: ['doc2x'] },
      },
      {
        title: 'should filter out invalid service names',
        input: 'pdf=unstructured,invalid,default,wrongservice',
        expected: { pdf: ['unstructured', 'default'] },
      },
      {
        title: 'should handle empty string input',
        input: '',
        expected: {},
      },
      {
        title: 'should skip invalid rule formats',
        input: 'pdf=unstructured;invalid;doc=doc2x;=default;txt',
        expected: { pdf: ['unstructured'], doc: ['doc2x'] },
      },
      {
        title: 'should handle whitespace in service names',
        input: 'pdf= unstructured , default ;doc=doc2x',
        expected: { pdf: ['unstructured', 'default'], doc: ['doc2x'] },
      },
      {
        // Repeated services are kept as-is; the parser does not de-duplicate.
        title: 'should handle duplicate services for same file type',
        input: 'pdf=unstructured,default,unstructured',
        expected: { pdf: ['unstructured', 'default', 'unstructured'] },
      },
    ];

    // Register the cases in declaration order so the report order is stable.
    for (const { title, input, expected } of scenarios) {
      it(title, () => {
        expect(ChunkingRuleParser.parse(input)).toEqual(expected);
      });
    }
  });
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
/**
 * Every recognised chunking service name. Kept in one tuple so the
 * `ChunkingService` type and the runtime validation cannot drift apart.
 */
const CHUNKING_SERVICES = ['unstructured', 'doc2x', 'default'] as const;

/** Name of a service that can be selected in a chunking rule. */
export type ChunkingService = (typeof CHUNKING_SERVICES)[number];

/** Narrows an arbitrary string to `ChunkingService` when it is a known name. */
const isChunkingService = (value: string): value is ChunkingService =>
  (CHUNKING_SERVICES as readonly string[]).includes(value);

export const ChunkingRuleParser = {
  /**
   * Parses a rule string of the form `type=service,service;type=service`
   * (for example `pdf=unstructured,default;doc=doc2x`) into a map from
   * lower-cased file type to its list of services.
   *
   * - File types and service names are trimmed and lower-cased.
   * - Unknown service names are dropped; order and duplicates are preserved.
   * - Segments with no `=`, a blank file type, or an empty service list are
   *   skipped.
   *
   * @param rulesStr - Raw rule string; an empty string yields an empty map.
   * @returns Map of file type to the services listed for it.
   */
  parse(rulesStr: string): Record<string, ChunkingService[]> {
    const rules: Record<string, ChunkingService[]> = {};

    // Each `;`-separated segment configures one file type.
    for (const rule of rulesStr.split(';')) {
      const [rawFileType, services] = rule.split('=');

      // Trim before validating so " pdf " and "pdf" map to the same key and a
      // whitespace-only file type is rejected instead of becoming a key.
      const fileType = rawFileType?.trim().toLowerCase();
      if (!fileType || !services) continue;

      rules[fileType] = services
        .split(',')
        .map((service) => service.trim().toLowerCase())
        .filter(isChunkingService);
    }

    return rules;
  },
} as const;