-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathchatMsg.py
179 lines (150 loc) · 6.5 KB
/
chatMsg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import json
import os
import pandas as pd
import pymysql
from dotenv import dotenv_values
from sqlalchemy import create_engine
def process_chat_data(file_name, encoding='utf-8'):
# 检查文件是否存在
if not os.path.isfile(file_name):
print(f"错误:文件 {file_name} 未找到。")
return # 如果文件不存在,则直接返回
# 打开 JSONL 文件,并逐行读取数据
print(f"开始加载文件{file_name}……💕")
with open(file_name, 'r', encoding='utf-8', errors='replace') as f:
# 定义一个空列表用于存储数据
data_list = []
data_text = []
data_image = []
data_voice = []
data_video = []
data_file = []
data_call = []
for i, line in enumerate(f):
# 输出行号和行内容(诊断错误)
print(f"Line {i+1}: {line}")
try:
# 将 JSON 字符串转换为 Python 对象
data = json.loads(line)
except json.JSONDecodeError:
# 如果不是有效的 JSON 数据,则跳过该行
continue
# 判断 action 是否为 send
if data.get('action') == 'send':
# 选择需要的字段
selected_data = {
'msgid': data['msgid'],
'from': data['from'],
'tolist': '|'.join(data.get('tolist', [])),
'roomid': data.get('roomid', ''),
'msgtime': data['msgtime'],
'msgtype': data['msgtype']
}
# 将选择的数据添加到列表中
data_list.append(selected_data)
# 处理不同类型数据
if data.get('msgtype') == 'text':
selected_data = {
'msgid': data['msgid'],
'content': data['text']['content']
}
data_text.append(selected_data)
if data.get('msgtype') == 'image':
selected_data = {
'msgid': data['msgid'],
'md5sum': data['image']['md5sum'],
'filesize': data['image']['filesize'],
'sdkfileid': data['image']['sdkfileid']
}
data_image.append(selected_data)
if data.get('msgtype') == 'voice':
selected_data = {
'msgid': data['msgid'],
'md5sum': data['voice']['md5sum'],
'voice_size': data['voice']['voice_size'],
'play_length': data['voice']['play_length'],
'sdkfileid': data['voice']['sdkfileid']
}
data_voice.append(selected_data)
if data.get('msgtype') == 'video':
selected_data = {
'msgid': data['msgid'],
'md5sum': data['video']['md5sum'],
'filesize': data['video']['filesize'],
'play_length': data['video']['play_length'],
'sdkfileid': data['video']['sdkfileid']
}
data_video.append(selected_data)
if data.get('msgtype') == 'file':
selected_data = {
'msgid': data['msgid'],
'md5sum': data['file']['md5sum'],
'filename': data['file']['filename'],
'fileext': data['file']['fileext'],
'filesize': data['file']['filesize'],
'sdkfileid': data['file']['sdkfileid']
}
data_file.append(selected_data)
if data.get('msgtype') == 'meeting_voice_call':
selected_data = {
'msgid': data['msgid'],
'voiceid': data['voiceid'],
'endtime': data['meeting_voice_call']['endtime'],
'sdkfileid': data['meeting_voice_call']['sdkfileid']
}
data_call.append(selected_data)
print("数据处理中……💕")
# 创建空的 DataFrame
df = pd.DataFrame()
# 将列表转换为 pandas DataFrame 格式
df = pd.DataFrame(data_list)
# 去重
df.drop_duplicates(inplace=True)
# 将 DataFrame 写入 Excel 文件
write_to_excel(df, "chat_list.xlsx", 'list')
# 将数据存入 MySQL 数据库
db_config = dotenv_values('.env')
save_to_mysql(df, 'chat_list', db_config)
# 定义消息类型
msg_types = ['text', 'image', 'voice', 'video', 'file', 'call']
for type in msg_types:
# 获取相应类型的数据
data = locals()[f"data_{type}"]
# 创建 DataFrame
df = pd.DataFrame(data)
# 去重
df.drop_duplicates(inplace=True)
# 将 DataFrame 写入 Excel 文件
write_to_excel(df, f"chat_{type}.xlsx", type)
# 将数据存入 MySQL 数据库
save_to_mysql(df, f'chat_{type}', db_config)
def save_to_mysql(df, table_name, db_config):
if db_config.get('SAVE_TO_DB') == 'True':
try:
# 创建 MySQL 数据库连接字符串
connection_string = f"mysql+pymysql://{db_config['DB_USER']}:{db_config['DB_PASSWORD']}@{db_config['DB_HOST']}:{db_config['DB_PORT']}/{db_config['DB_NAME']}"
# 创建数据库连接引擎
engine = create_engine(connection_string)
# 将 DataFrame 写入 MySQL 数据库的表中
if not df.empty:
df.to_sql(name=table_name, con=engine, if_exists='append', index=False)
else:
print(f"{table_name} 是空的,跳过保存到数据库。")
# 关闭数据库连接
engine.dispose()
# 打印结果
print(f'{table_name} 数据已保存到 MySQL 数据库 ✔')
except pymysql.Error as e:
print(f"数据库连接错误:{e}")
def write_to_excel(df, file_name, sheet_name):
# 将 DataFrame 写入 Excel 文件
with pd.ExcelWriter(file_name) as writer:
df.to_excel(writer, sheet_name=sheet_name, index=False)
# 打印结果
print(f'{file_name} 已保存 ✔')
if __name__ == '__main__':
import sys
if len(sys.argv) > 1:
process_chat_data(sys.argv[1])
else:
process_chat_data('chatdata.jsonl')