-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathingest-data.ts
110 lines (100 loc) · 3.16 KB
/
ingest-data.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import { FaissStore } from './faiss';
import ZipLoader from '@/loaders/zip';
import { HttpsProxyAgent } from 'https-proxy-agent';
import path from 'path';
import { outputDir } from '@/config';
import { getApiConfig, getProxy } from '@/electron/storage';
import fetch from 'node-fetch';
import { Document } from '@/types/document';
import { getCodeDocs, getPdfDocs, getTextDocs } from '@/loaders';
import { default as Embeddings } from '@/electron/embeddings';
/** Name of the embedding model handed to the embeddings client. */
const embeddingModel = 'text-embedding-ada-002';

/**
 * File-name suffixes of the supported source-code and markup languages.
 * Order is preserved because the array is exported.
 */
export const supportedLanguages: string[] = [
  // programming languages
  '.cpp', '.go', '.java', '.js', '.php',
  '.proto', '.python', '.rst', '.ruby', '.rust', '.scala',
  // markup
  '.markdown', '.md', '.html',
  // more programming languages
  '.sol', '.kotlin',
];

/**
 * Every suffix accepted for ingestion: all of `supportedLanguages`
 * followed by the PDF, plain-text and zip-archive suffixes.
 */
export const supportedDocuments: string[] = supportedLanguages.concat([
  '.pdf',
  '.txt',
  '.zip',
]);
/**
 * Converts one input file into a flat list of `Document`s, picking the loader
 * from the suffix of `filePath`:
 *
 * - `.pdf`  -> `getPdfDocs`
 * - `.txt`  -> `getTextDocs` (buffer converted with `toString()`)
 * - `.zip`  -> the archive is parsed with `ZipLoader`; each entry whose path
 *              ends with one of `supportedDocuments` is loaded according to
 *              its own suffix and all results are concatenated
 * - anything else -> `getCodeDocs` (buffer converted with `toString()`)
 *
 * @param buffer   Content of the file to ingest.
 * @param filename Forwarded unchanged to every loader.
 * @param filePath Path whose suffix selects the loader; forwarded unchanged
 *                 to every loader (archive entries also receive the archive's
 *                 `filePath`, not their own entry path).
 * @returns The documents produced by the selected loader(s).
 */
async function getDocuments({ buffer, filename, filePath }: IngestParams): Promise<Document[]> {
  if (filePath.endsWith('.pdf')) {
    return getPdfDocs({ buffer, filename, filePath });
  }
  if (filePath.endsWith('.txt')) {
    return getTextDocs({ buffer: buffer.toString(), filename, filePath });
  }
  if (filePath.endsWith('.zip')) {
    // Only keep archive entries whose path carries a supported suffix.
    // The parameter is named `entryPath` so it does not shadow the imported
    // `path` module.
    const entries = await new ZipLoader().parse(buffer as Buffer, entryPath => {
      return supportedDocuments.some(ext => entryPath.endsWith(ext));
    });

    // Start every loader first, then wait for all of them together.
    const tasks: Array<Promise<Document[]>> = [];
    for (const entry of entries) {
      const { path: entryPath, content } = entry;
      if (entryPath.endsWith('.pdf')) {
        tasks.push(
          getPdfDocs({ buffer: Buffer.from(content), filename, filePath })
        );
      } else if (entryPath.endsWith('.txt')) {
        tasks.push(
          getTextDocs({ buffer: content, filename, filePath })
        );
      } else {
        // NOTE(review): `supportedDocuments` contains '.zip', so a nested
        // archive entry passes the filter above and ends up here, handled as
        // code — confirm whether that is intended.
        tasks.push(
          getCodeDocs({ buffer: content, filename, filePath })
        );
      }
    }

    const docsPerEntry = await Promise.all(tasks);
    return docsPerEntry.flat();
  }
  return getCodeDocs({ buffer: buffer.toString(), filename, filePath });
}
/**
 * Ingests one file: loads it into documents via `getDocuments`, builds a
 * `FaissStore` from them using `Embeddings` configured from the stored API
 * settings (optionally going through the stored proxy), and saves the store
 * to `path.join(outputDir, filename)`.
 *
 * @param buffer   Content of the file to ingest.
 * @param filename Forwarded to the loaders and used as the last path segment
 *                 of the saved vector store.
 * @param filePath Path of the file; its suffix selects the loader.
 * @returns A promise that resolves with no value once the store is saved.
 *          On any failure the error is logged and the promise rejects with
 *          the thrown value's `code` property when that is truthy, otherwise
 *          with the string `'ingest data failed'`.
 */
export default async ({ buffer, filename, filePath }: IngestParams) => {
  const proxy = getProxy() as string;
  const config = getApiConfig();
  try {
    const docs = await getDocuments({
      buffer,
      filename,
      filePath
    });
    const embeddings = new Embeddings(
      {
        openAIApiKey: config.apiKey,
        modelName: embeddingModel
      },
      {
        // Only route requests through a proxy agent when a proxy is configured.
        httpAgent: proxy ? new HttpsProxyAgent(proxy) : undefined,
        // @ts-ignore
        fetch,
        baseURL: config.baseUrl
      }
    );
    const vectorStore = await FaissStore.fromDocuments(docs, embeddings);
    const outputFilePath = path.join(outputDir, filename);
    await vectorStore.save(outputFilePath);
  } catch (error) {
    console.log('error', error);
    // Anything can be thrown (including null/undefined), and under strict
    // settings the catch variable is `unknown`, so read `code` defensively
    // instead of dereferencing the value directly.
    const code = (error as { code?: unknown } | null | undefined)?.code;
    return Promise.reject(code || 'ingest data failed');
  }
};