✨ feat: Add configurable PDF processing method with Unstructured (lobehub#5927)

* ✨ feat: Add configurable PDF processing method with Unstructured
* 🔧 fix: Update import path for env utility in ContentChunk module
* feat: add USE_UNSTRUCTURED_FOR_PDF environment variable to knowledge config
* Delete src/server/utils/env.ts
* feat: implement ChunkingRuleParser for file type and service mapping
* refactor: remove USE_UNSTRUCTURED_FOR_PDF from knowledge environment configuration
* test: add unit tests for ChunkingRuleParser functionality
* refactor: remove isUsingUnstructured method from ContentChunk class
* refactor: update ChunkingService type and clean up ContentChunk rules
* refactor: simplify ChunkingRuleParser and update ContentChunk module
* refactor: update ContentChunk module import for ChunkingService
bentwnghk · Feb 15, 2025 · 35fa3ee · 35fa3ee
1 parent b61cec7
commit 35fa3ee
Show file tree

Hide file tree

Showing 4 changed files with 162 additions and 22 deletions.
diff --git a/src/config/knowledge.ts b/src/config/knowledge.ts
@@ -1,19 +1,17 @@
 import { createEnv } from '@t3-oss/env-nextjs';
 import { z } from 'zod';
 
-export const getKnowledgeConfig = () => {
-  return createEnv({
-    runtimeEnv: {
-      DEFAULT_FILES_CONFIG: process.env.DEFAULT_FILES_CONFIG,
-      UNSTRUCTURED_API_KEY: process.env.UNSTRUCTURED_API_KEY,
-      UNSTRUCTURED_SERVER_URL: process.env.UNSTRUCTURED_SERVER_URL,
-    },
-    server: {
-      DEFAULT_FILES_CONFIG: z.string().optional(),
-      UNSTRUCTURED_API_KEY: z.string().optional(),
-      UNSTRUCTURED_SERVER_URL: z.string().optional(),
-    },
-  });
-};
-
-export const knowledgeEnv = getKnowledgeConfig();
+export const knowledgeEnv = createEnv({
+  runtimeEnv: {
+    DEFAULT_FILES_CONFIG: process.env.DEFAULT_FILES_CONFIG,
+    FILE_TYPE_CHUNKING_RULES: process.env.FILE_TYPE_CHUNKING_RULES,
+    UNSTRUCTURED_API_KEY: process.env.UNSTRUCTURED_API_KEY,
+    UNSTRUCTURED_SERVER_URL: process.env.UNSTRUCTURED_SERVER_URL,
+  },
+  server: {
+    DEFAULT_FILES_CONFIG: z.string().optional(),
+    FILE_TYPE_CHUNKING_RULES: z.string().optional(),
+    UNSTRUCTURED_API_KEY: z.string().optional(),
+    UNSTRUCTURED_SERVER_URL: z.string().optional(),
+  },
+});
diff --git a/src/server/modules/ContentChunk/index.ts b/src/server/modules/ContentChunk/index.ts
@@ -1,9 +1,13 @@
 import { ChunkingLoader } from 'src/libs/langchain';
 import { Strategy } from 'unstructured-client/sdk/models/shared';
 
-import { NewChunkItem, NewUnstructuredChunkItem } from '@/database/schemas';
+import { knowledgeEnv } from '@/config/knowledge';
+import type { NewChunkItem, NewUnstructuredChunkItem } from '@/database/schemas';
 import { ChunkingStrategy, Unstructured } from '@/libs/unstructured';
 
+import { ChunkingRuleParser } from './rules';
+import type { ChunkingService } from './rules';
+
 export interface ChunkContentParams {
   content: Uint8Array;
   fileType: string;
@@ -19,23 +23,57 @@ interface ChunkResult {
 export class ContentChunk {
   private unstructuredClient: Unstructured;
   private langchainClient: ChunkingLoader;
+  private chunkingRules: Record<string, ChunkingService[]>;
 
   constructor() {
     this.unstructuredClient = new Unstructured();
     this.langchainClient = new ChunkingLoader();
+    this.chunkingRules = ChunkingRuleParser.parse(knowledgeEnv.FILE_TYPE_CHUNKING_RULES || '');
   }
 
-  isUsingUnstructured(params: ChunkContentParams) {
-    return params.fileType === 'application/pdf' && params.mode === 'hi-res';
+  private getChunkingServices(fileType: string): ChunkingService[] {
+    const ext = fileType.split('/').pop()?.toLowerCase() || '';
+    return this.chunkingRules[ext] || ['default'];
   }
 
   async chunkContent(params: ChunkContentParams): Promise<ChunkResult> {
-    if (this.isUsingUnstructured(params))
-      return await this.chunkByUnstructured(params.filename, params.content);
-
+    const services = this.getChunkingServices(params.fileType);
+
+    for (const service of services) {
+      try {
+        switch (service) {
+          case 'unstructured': {
+            if (this.canUseUnstructured()) {
+              return await this.chunkByUnstructured(params.filename, params.content);
+            }
+            break;
+          }
+
+          case 'doc2x': {
+            // Future implementation
+            break;
+          }
+
+          default: {
+            return await this.chunkByLangChain(params.filename, params.content);
+          }
+        }
+      } catch (error) {
+        // If this is the last service, throw the error
+        if (service === services.at(-1)) throw error;
+        // Otherwise continue to next service
+        console.error(`Chunking failed with service ${service}:`, error);
+      }
+    }
+
+    // Fallback to langchain if no service succeeded
     return await this.chunkByLangChain(params.filename, params.content);
   }
 
+  private canUseUnstructured(): boolean {
+    return !!(knowledgeEnv.UNSTRUCTURED_API_KEY && knowledgeEnv.UNSTRUCTURED_SERVER_URL);
+  }
+
   private chunkByUnstructured = async (
     filename: string,
     content: Uint8Array,

diff --git a/src/server/modules/ContentChunk/rules.test.ts b/src/server/modules/ContentChunk/rules.test.ts
@@ -0,0 +1,81 @@
+import { describe, expect, it } from 'vitest';
+import { ChunkingRuleParser } from './rules';
+
+describe('ChunkingRuleParser', () => {
+  describe('parse', () => {
+    it('should parse a single file type rule correctly', () => {
+      const input = 'pdf=unstructured,default';
+      const result = ChunkingRuleParser.parse(input);
+
+      expect(result).toEqual({
+        pdf: ['unstructured', 'default'],
+      });
+    });
+
+    it('should parse multiple file type rules correctly', () => {
+      const input = 'pdf=unstructured,default;doc=doc2x,default;txt=default';
+      const result = ChunkingRuleParser.parse(input);
+
+      expect(result).toEqual({
+        pdf: ['unstructured', 'default'],
+        doc: ['doc2x', 'default'],
+        txt: ['default'],
+      });
+    });
+
+    it('should convert file types to lowercase', () => {
+      const input = 'PDF=unstructured;DOC=doc2x';
+      const result = ChunkingRuleParser.parse(input);
+
+      expect(result).toEqual({
+        pdf: ['unstructured'],
+        doc: ['doc2x'],
+      });
+    });
+
+    it('should filter out invalid service names', () => {
+      const input = 'pdf=unstructured,invalid,default,wrongservice';
+      const result = ChunkingRuleParser.parse(input);
+
+      expect(result).toEqual({
+        pdf: ['unstructured', 'default'],
+      });
+    });
+
+    it('should handle empty string input', () => {
+      const input = '';
+      const result = ChunkingRuleParser.parse(input);
+
+      expect(result).toEqual({});
+    });
+
+    it('should skip invalid rule formats', () => {
+      const input = 'pdf=unstructured;invalid;doc=doc2x;=default;txt';
+      const result = ChunkingRuleParser.parse(input);
+
+      expect(result).toEqual({
+        pdf: ['unstructured'],
+        doc: ['doc2x'],
+      });
+    });
+
+    it('should handle whitespace in service names', () => {
+      const input = 'pdf= unstructured , default ;doc=doc2x';
+      const result = ChunkingRuleParser.parse(input);
+
+      expect(result).toEqual({
+        pdf: ['unstructured', 'default'],
+        doc: ['doc2x'],
+      });
+    });
+
+    it('should handle duplicate services for same file type', () => {
+      const input = 'pdf=unstructured,default,unstructured';
+      const result = ChunkingRuleParser.parse(input);
+
+      expect(result).toEqual({
+        pdf: ['unstructured', 'default', 'unstructured'],
+      });
+    });
+  });
+});
diff --git a/src/server/modules/ContentChunk/rules.ts b/src/server/modules/ContentChunk/rules.ts
@@ -0,0 +1,23 @@
+export type ChunkingService = 'unstructured' | 'doc2x' | 'default';
+
+export const ChunkingRuleParser = {
+  parse(rulesStr: string): Record<string, ChunkingService[]> {
+    const rules: Record<string, ChunkingService[]> = {};
+
+    // Split by semicolon for different file types
+    const fileTypeRules = rulesStr.split(';');
+
+    for (const rule of fileTypeRules) {
+      const [fileType, services] = rule.split('=');
+      if (!fileType || !services) continue;
+
+      // Split services by comma and validate each service
+      rules[fileType.toLowerCase()] = services
+        .split(',')
+        .map((s) => s.trim().toLowerCase())
+        .filter((s): s is ChunkingService => ['unstructured', 'doc2x', 'default'].includes(s));
+    }
+
+    return rules;
+  },
+} as const;