import { createEnv } from '@t3-oss/env-nextjs';
import { z } from 'zod';

/** Every knowledge variable shares the same schema: an optional string. */
const optionalString = () => z.string().optional();

// Validation schema for the variables, declared under `server` in `createEnv`.
const server = {
  DEFAULT_FILES_CONFIG: optionalString(),
  FILE_TYPE_CHUNKING_RULES: optionalString(),
  UNSTRUCTURED_API_KEY: optionalString(),
  UNSTRUCTURED_SERVER_URL: optionalString(),
};

// Raw values, read from `process.env` when this module is evaluated.
const runtimeEnv = {
  DEFAULT_FILES_CONFIG: process.env.DEFAULT_FILES_CONFIG,
  FILE_TYPE_CHUNKING_RULES: process.env.FILE_TYPE_CHUNKING_RULES,
  UNSTRUCTURED_API_KEY: process.env.UNSTRUCTURED_API_KEY,
  UNSTRUCTURED_SERVER_URL: process.env.UNSTRUCTURED_SERVER_URL,
};

/**
 * Validated knowledge-base environment, built once at module load.
 *
 * All four variables are optional strings:
 * `DEFAULT_FILES_CONFIG`, `FILE_TYPE_CHUNKING_RULES`,
 * `UNSTRUCTURED_API_KEY` and `UNSTRUCTURED_SERVER_URL`.
 */
export const knowledgeEnv = createEnv({ runtimeEnv, server });
private unstructuredClient: Unstructured;
private langchainClient: ChunkingLoader;
// Ordered chunking services to try, keyed by lower-cased file-type key.
private chunkingRules: Record<string, ChunkingService[]>;

/**
 * Creates both chunking clients and parses `FILE_TYPE_CHUNKING_RULES` once.
 * An unset variable is parsed as the empty string.
 */
constructor() {
  this.unstructuredClient = new Unstructured();
  this.langchainClient = new ChunkingLoader();
  this.chunkingRules = ChunkingRuleParser.parse(knowledgeEnv.FILE_TYPE_CHUNKING_RULES ?? '');
}

/**
 * Resolves the ordered services configured for a file type.
 *
 * @param fileType - File type string; only its last `/`-separated segment,
 *   lower-cased, is used as the rule key.
 * @returns The configured services, or `['default']` when no rule exists.
 */
private getChunkingServices(fileType: string): ChunkingService[] {
  const ext = fileType.split('/').pop()?.toLowerCase() ?? '';

  // Own-property check: a key such as `constructor` or `__proto__` must not
  // resolve to something inherited from Object.prototype, which is not an
  // array of services and would break the iteration in `chunkContent`.
  const hasRule = Object.prototype.hasOwnProperty.call(this.chunkingRules, ext);
  const configured = hasRule ? this.chunkingRules[ext] : undefined;

  return configured ?? ['default'];
}

/**
 * Chunks the content by trying each configured service in order.
 *
 * - `unstructured` is used only when its credentials are configured;
 *   otherwise it is skipped.
 * - `doc2x` is a placeholder and is always skipped.
 * - Any other service (`default`) chunks with LangChain.
 *
 * A failing service is logged and the next one is tried. If every service
 * is skipped, LangChain is used as the final fallback.
 *
 * @param params - Content, file name and file type of the file to chunk.
 * @returns The chunks produced by the first service that succeeds.
 * @throws Rethrows the error raised by the last service in the list.
 */
async chunkContent(params: ChunkContentParams): Promise<ChunkResult> {
  const services = this.getChunkingServices(params.fileType);

  for (const [index, service] of services.entries()) {
    // Decide "last" by position, not by name: a list may repeat a service
    // (e.g. `unstructured,default,unstructured`), and comparing names would
    // rethrow on the first occurrence and skip the remaining services.
    const isLastService = index === services.length - 1;

    try {
      switch (service) {
        case 'unstructured': {
          if (this.canUseUnstructured()) {
            return await this.chunkByUnstructured(params.filename, params.content);
          }
          break;
        }

        case 'doc2x': {
          // Future implementation
          break;
        }

        default: {
          return await this.chunkByLangChain(params.filename, params.content);
        }
      }
    } catch (error) {
      // Nothing left to fall back to: surface the failure to the caller.
      if (isLastService) throw error;
      // Otherwise continue to next service
      console.error(`Chunking failed with service ${service}:`, error);
    }
  }

  // Fallback to langchain if no service succeeded
  return await this.chunkByLangChain(params.filename, params.content);
}

/** True when both the Unstructured API key and server URL are configured. */
private canUseUnstructured(): boolean {
  return !!(knowledgeEnv.UNSTRUCTURED_API_KEY && knowledgeEnv.UNSTRUCTURED_SERVER_URL);
}
import { describe, expect, it } from 'vitest';

import { ChunkingRuleParser } from './rules';

/** One `parse` scenario: the raw rule string and the mapping it must yield. */
interface ParseCase {
  expected: Record<string, string[]>;
  input: string;
  title: string;
}

// Table of scenarios; each row becomes one `it` with the same title,
// input and expectation as the hand-written cases it replaces.
const parseCases: ParseCase[] = [
  {
    title: 'should parse a single file type rule correctly',
    input: 'pdf=unstructured,default',
    expected: { pdf: ['unstructured', 'default'] },
  },
  {
    title: 'should parse multiple file type rules correctly',
    input: 'pdf=unstructured,default;doc=doc2x,default;txt=default',
    expected: {
      pdf: ['unstructured', 'default'],
      doc: ['doc2x', 'default'],
      txt: ['default'],
    },
  },
  {
    title: 'should convert file types to lowercase',
    input: 'PDF=unstructured;DOC=doc2x',
    expected: { pdf: ['unstructured'], doc: ['doc2x'] },
  },
  {
    title: 'should filter out invalid service names',
    input: 'pdf=unstructured,invalid,default,wrongservice',
    expected: { pdf: ['unstructured', 'default'] },
  },
  {
    title: 'should handle empty string input',
    input: '',
    expected: {},
  },
  {
    title: 'should skip invalid rule formats',
    input: 'pdf=unstructured;invalid;doc=doc2x;=default;txt',
    expected: { pdf: ['unstructured'], doc: ['doc2x'] },
  },
  {
    title: 'should handle whitespace in service names',
    input: 'pdf= unstructured , default ;doc=doc2x',
    expected: { pdf: ['unstructured', 'default'], doc: ['doc2x'] },
  },
  {
    title: 'should handle duplicate services for same file type',
    input: 'pdf=unstructured,default,unstructured',
    expected: { pdf: ['unstructured', 'default', 'unstructured'] },
  },
];

describe('ChunkingRuleParser', () => {
  describe('parse', () => {
    for (const { title, input, expected } of parseCases) {
      it(title, () => {
        expect(ChunkingRuleParser.parse(input)).toEqual(expected);
      });
    }
  });
});
/** Name of a chunking backend that a rule may reference. */
export type ChunkingService = 'unstructured' | 'doc2x' | 'default';

// Accepted service names, hoisted so the lookup set is built once rather
// than on every filter callback.
const SERVICE_NAMES: readonly ChunkingService[] = ['unstructured', 'doc2x', 'default'];
const KNOWN_SERVICES: ReadonlySet<string> = new Set<string>(SERVICE_NAMES);

export const ChunkingRuleParser = {
  /**
   * Parses a rule string such as `pdf=unstructured,default;doc=doc2x`.
   *
   * Rules are separated by `;`. Each rule is `<fileType>=<service>[,<service>…]`.
   * File types and service names are trimmed and lower-cased. Unknown service
   * names are dropped, while order and duplicates of known ones are kept.
   * Rules with a blank file type or no `=<services>` part are skipped.
   *
   * @param rulesStr - Raw rule string; the empty string yields no rules.
   * @returns Map from file type to its ordered list of services.
   */
  parse(rulesStr: string): Record<string, ChunkingService[]> {
    const rules: Record<string, ChunkingService[]> = {};

    // Split by semicolon for different file types
    for (const rule of rulesStr.split(';')) {
      const [rawFileType, services] = rule.split('=');

      // Trim before validating so ` pdf = default` is stored under `pdf`
      // and a whitespace-only key is rejected like an empty one.
      const fileType = rawFileType?.trim().toLowerCase();
      if (!fileType || !services) continue;

      // Split services by comma and validate each service
      rules[fileType] = services
        .split(',')
        .map((s) => s.trim().toLowerCase())
        .filter((s): s is ChunkingService => KNOWN_SERVICES.has(s));
    }

    return rules;
  },
} as const;