-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathformat_converter.py
58 lines (48 loc) · 1.85 KB
/
format_converter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import json
import os
from llama_index.core.schema import TextNode, NodeRelationship, RelatedNodeInfo
from llama_index.core import Document
def nodefile2node(input_file) -> "list[TextNode]":
    """Load serialized nodes from a JSON file.

    The file is parsed with ``json.load`` and each element of the resulting
    iterable is passed to ``TextNode.from_dict``.

    Args:
        input_file: Path of the JSON file to read. It is opened in text mode
            with the platform default encoding.

    Returns:
        A list with one ``TextNode`` per element of the parsed JSON.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(input_file, 'r') as fh:
        serialized_nodes = json.load(fh)
    return [TextNode.from_dict(doc) for doc in serialized_nodes]
def onlchunkfile2node(input_file) -> "list[TextNode]":
    """Build a chain of ``TextNode`` objects from a JSON chunk file.

    Each entry of the parsed JSON must provide ``'title'`` and ``'content'``
    and may provide ``'hier_title'``. The node text is the plain concatenation
    ``title + hier_title + content`` (no separator). Consecutive nodes are
    linked to each other through PREVIOUS / NEXT relationships.

    Args:
        input_file: Path of the JSON file to read. It is opened in text mode
            with the platform default encoding and is also forwarded to every
            node as the ``file_name`` keyword.

    Returns:
        The list of created nodes, in file order.

    Raises:
        KeyError: If an entry lacks ``'title'`` or ``'content'``.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(input_file, 'r') as fh:
        content_json = json.load(fh)

    nodes = []
    for data in content_json:
        # file_name is handed to TextNode as an extra keyword; how TextNode
        # stores it is not visible here.
        node = TextNode(
            text=data['title'] + data.get('hier_title', '') + data['content'],
            file_name=input_file,
        )
        nodes.append(node)

        # NOTE(review): linking is done per iteration so that every
        # consecutive pair is chained, not only the last two nodes.
        if len(nodes) > 1:
            previous_node, current_node = nodes[-2], nodes[-1]
            current_node.relationships[NodeRelationship.PREVIOUS] = RelatedNodeInfo(
                node_id=previous_node.node_id
            )
            previous_node.relationships[NodeRelationship.NEXT] = RelatedNodeInfo(
                node_id=current_node.node_id
            )
    return nodes
def transform_idp2markdown(response_json: dict) -> str:
    """Flatten a layout response into newline-separated text.

    If ``response_json`` contains a ``'layouts'`` key, the value under that
    key is iterated; otherwise ``response_json`` itself is iterated, so a bare
    sequence of layout entries is accepted as well.

    Each non-``None`` layout contributes its ``'text'`` followed by a newline.
    Layouts whose ``'type'`` is ``"title"`` are additionally preceded by three
    newlines; no ``#`` heading marker is emitted.

    Side effect: every layout lacking a ``'subType'`` key gets
    ``'subType': 'para'`` written into it. The value is not used by this
    function; the write is kept so callers that read it afterwards still
    see it.

    Args:
        response_json: Mapping with a ``'layouts'`` entry, or the iterable of
            layout dicts itself.

    Returns:
        The concatenated text.

    Raises:
        KeyError: If a layout lacks ``'type'`` or ``'text'``.
    """
    if 'layouts' in response_json:
        response_json = response_json['layouts']

    # Collect the pieces and join once instead of growing a string with +=.
    pieces = []
    for layout in response_json:
        if layout is None:
            continue
        # Default the sub-type in place (mutates the caller's layout dict).
        layout.setdefault('subType', 'para')
        if layout["type"] == "title":
            # Titles are set off from the preceding text by blank lines.
            pieces.append("\n\n\n" + layout["text"] + '\n')
        else:
            # Everything else is emitted as a plain line of text.
            pieces.append(layout["text"] + "\n")
    return "".join(pieces)
def documentfile2document(input_file) -> "list[Document]":
    """Load serialized documents from a JSON file.

    The file is parsed with ``json.load`` and each element of the resulting
    iterable is passed to ``Document.from_dict``.

    Args:
        input_file: Path of the JSON file to read. It is opened in text mode
            with the platform default encoding.

    Returns:
        A list with one ``Document`` per element of the parsed JSON.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(input_file, 'r') as fh:
        serialized_documents = json.load(fh)
    return [Document.from_dict(doc) for doc in serialized_documents]
def text2document(input_file) -> "list[Document]":
    """Wrap the full contents of a text file in a single ``Document``.

    Args:
        input_file: Path of the file to read. It is opened in text mode with
            the platform default encoding and recorded in the document
            metadata under ``"file_name"``.

    Returns:
        A one-element list holding the created ``Document``.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(input_file, 'r') as fh:
        text = fh.read()
    metadata = {"file_name": input_file}
    return [Document(text=text, metadata=metadata)]