-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdebug.py
82 lines (73 loc) · 3.63 KB
/
debug.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import pandas as pd
import time
import datetime
import scipy.stats as st
import os
from classifier_base_functions import probMultiTfidf,sumTfidf
import pickle
# ---------------------------------------------------------------------------
# Data preparation
# ---------------------------------------------------------------------------
# NOTE: pd.read_pickle can execute arbitrary code embedded in the file, so
# only point these paths at trusted data.
total_msg = pd.read_pickle(r'../records/sample/sample_msg_tfidf_pickle')

# Discard the columns that nothing below reads.
total_msg = total_msg.drop(columns=['type', 'status', 'isSend', 'splitwords', 'msg'])

# Sort chronologically so that consecutive differences are the gaps between
# successive messages; their running sum is the number of seconds elapsed
# since the earliest message.  The new column is appended last, which keeps
# the positional column layout that autherProb/converProb index into.
total_msg = total_msg.sort_values('createTime')
gap_seconds = total_msg['createTime'].diff().dt.total_seconds().fillna(0)
total_msg['intTime'] = gap_seconds.cumsum()

# Regroup by chat (talker), chronological inside each chat, renumber the rows
# 0..n-1 and keep only the first 1000 rows as a test subsample.
total_msg = total_msg.sort_values(['talker', 'createTime'], ascending=True)
total_msg = total_msg.reset_index(drop=True)
total_msg = total_msg.iloc[:1000]

# Mention table; converProb reads its msgSvrId, member_x and sender columns.
msgat = pd.read_pickle(r'../records/sample/msgat_dataframe')
def autherProb(row, t_scale, contextdf, w_auther):
    """Append the author-context TF-IDF expansion of one message to a file.

    The context is every row of ``contextdf`` written by the same sender as
    ``row``, excluding ``row`` itself.  Each context row's ``tfidf`` entry is
    paired with a zero-mean normal density (standard deviation ``t_scale``)
    evaluated at its time offset from this message, after the offset has been
    divided by the module-level ``divideN``.  The pairs are combined by
    ``probMultiTfidf`` and ``sumTfidf`` (project helpers; ``sumTfidf`` must
    return a mapping), every value is multiplied by ``w_auther``, and one
    line ``"<row label>,<dict>"`` is appended to
    ``../records/sample/auther_expand_test``.

    Args:
        row: pandas Series for one message.  ``row.name`` is its index label,
            position 3 is compared against ``contextdf.sender`` and
            position 7 is the message's ``intTime`` value.
        t_scale: ``scale`` argument passed to ``scipy.stats.norm.pdf``.
        contextdf: DataFrame with ``sender``, ``intTime`` and ``tfidf``
            columns, sharing its index labels with ``row``.
        w_auther: multiplier applied to every value of the summed result.

    Returns:
        None.  The result is only written to disk.
    """
    msgrowid = row.name
    # Use .iloc for positional access: plain ``row[3]`` on a label-indexed
    # Series relies on an integer-key fallback that pandas has deprecated.
    sender = row.iloc[3]
    msgdate = row.iloc[7]  # intTime

    same_sender = contextdf.sender == sender
    not_this_row = contextdf.index != msgrowid
    autherContext = contextdf[same_sender & not_this_row]

    # ndarray of offsets from this message, rescaled by divideN.
    timediff = (autherContext['intTime'].values - msgdate) / divideN
    tfidfList = autherContext['tfidf'].tolist()
    probArray = st.norm.pdf(timediff, scale=t_scale)
    newTfidf = probMultiTfidf(probArray, tfidfList)
    result = {k: (w_auther * v) for k, v in sumTfidf(newTfidf).items()}

    # Append mode: repeated runs keep adding lines to the same file.
    with open(r'../records/sample/auther_expand_test', 'a') as f:
        f.write("{0},{1}\n".format(msgrowid, result))
def converProb(row, t_scale, contextdf, msgat, w_conver):
    """Append the conversation-context TF-IDF expansion of one message to a file.

    The conversation partners of a message are (a) the ``member_x`` values of
    the ``msgat`` rows whose ``msgSvrId`` equals this message's id, and
    (b) the ``sender`` values of the ``msgat`` rows whose ``member_x`` equals
    this message's sender.  Every row of ``contextdf`` sent by one of those
    partners forms the context.  Each context row's ``tfidf`` entry is paired
    with a zero-mean normal density (standard deviation ``t_scale``)
    evaluated at its time offset from this message, after the offset has been
    divided by the module-level ``divideN``.  The pairs are combined by
    ``probMultiTfidf`` and ``sumTfidf`` (project helpers; ``sumTfidf`` must
    return a mapping), every value is multiplied by ``w_conver``, and one
    line ``"<message id>,<dict>"`` is appended to
    ``../records/sample/conver_expand_test``.

    Args:
        row: one message, normally a pandas Series.  Position 1 is the
            message id matched against ``msgat.msgSvrId``, position 3 the
            sender, position 7 the message's ``intTime`` value.
        t_scale: ``scale`` argument passed to ``scipy.stats.norm.pdf``.
        contextdf: DataFrame with ``sender``, ``intTime`` and ``tfidf``
            columns.
        msgat: DataFrame with ``msgSvrId``, ``member_x`` and ``sender``
            columns.
        w_conver: multiplier applied to every value of the summed result.

    Returns:
        None.  The result is only written to disk.
    """
    # A Series needs .iloc for positional access (integer keys through plain
    # [] are a deprecated fallback on a label index); a plain sequence can be
    # indexed directly, so both kinds of row keep working.
    cells = getattr(row, 'iloc', row)
    msgid = cells[1]
    sender = cells[3]
    msgdate = cells[7]  # intTime

    # People this message mentions.
    mentionNames = msgat[msgat.msgSvrId == msgid]['member_x'].tolist()
    # People who mentioned the sender of this message.
    mentionNames.extend(msgat[msgat.member_x == sender]['sender'].tolist())

    # Everything those people said inside the context frame.
    converContext = contextdf[contextdf.sender.isin(mentionNames)]

    timediff = (converContext['intTime'].values - msgdate) / divideN
    tfidfList = converContext['tfidf'].tolist()
    probArray = st.norm.pdf(timediff, scale=t_scale)
    newTfidf = probMultiTfidf(probArray, tfidfList)
    result = {k: (v * w_conver) for k, v in sumTfidf(newTfidf).items()}

    # Append mode: repeated runs keep adding lines to the same file.
    with open(r'../records/sample/conver_expand_test', 'a') as f:
        f.write("{0},{1}\n".format(msgid, result))
def mExpandAutherConver(msgdate, talker):
    """Run the conversation expansion for every message of one chat on one day.

    Target messages are the rows of the module-level ``total_msg`` whose
    ``createTime`` falls on ``msgdate`` and whose ``talker`` equals
    ``talker``.  Their context is every ``total_msg`` row of the same talker
    with ``createTime`` strictly between ``twindow`` hours before the start
    of that day and ``twindow`` hours after its end.  ``converProb`` is
    called once per target row and writes its own output file.

    Args:
        msgdate: the calendar day to process (needs ``year``, ``month`` and
            ``day`` attributes, e.g. ``datetime.date``).
        talker: value of the ``talker`` column identifying the chat.

    Returns:
        None.
    """
    day_start = datetime.datetime(msgdate.year, msgdate.month, msgdate.day)
    maxdate = day_start + datetime.timedelta(hours=24 + twindow)
    mindate = day_start - datetime.timedelta(hours=twindow)

    # Context and targets must both come from the same chat room.
    same_talker = total_msg.talker == talker
    targetdf = total_msg[(total_msg.createTime.dt.date == msgdate) & same_talker]
    if targetdf.empty:
        # The caller tries every (date, talker) combination, so most calls
        # have nothing to process; skip building the context for them.
        return

    in_window = (total_msg.createTime > mindate) & (total_msg.createTime < maxdate)
    contextdf = total_msg[in_window & same_talker]

    # NOTE(review): an earlier version built a slice of msgat limited to this
    # time window and talker here but never used it; converProb has always
    # received the full msgat.  The dead computation was removed.  Confirm
    # whether converProb should receive the bounded slice instead.

    # The author expansion is currently disabled.  To re-enable it, also call
    # autherProb(row, auther_scale, contextdf, w_auther) inside the loop.
    # A plain loop is used because the calls are made only for their side
    # effect (appending to a file), not to build a result.
    for _, row in targetdf.iterrows():
        converProb(row, conver_scale, contextdf, msgat, w_conver)
# ---------------------------------------------------------------------------
# Tunable parameters
# ---------------------------------------------------------------------------
# Hours of padding added before and after the target day when the context
# window is built.  (Original note: this should be applied to the global
# total_msg table, processing the data of the matching window each time.)
twindow = 3
# Divisor used to rescale the time differences before the normal density is
# evaluated.
divideN = 1500

# Weights.  w_conver is used by the driver below; w_auther only matters when
# the author expansion is enabled; the others are not referenced in this file.
w_contextfree = 0.14
w_auther = 0.3
w_conver = 0.6
w_temp = 0.1
threshold = 0.6

# Standard deviations of the normal time kernels.
auther_scale = 2.347
conver_scale = 0.383
temporal_scale = 4.686
#%%
# Process every (day, chat) combination: days in the outer loop, chats in
# the inner loop.
for msgdate in total_msg['createTime'].dt.date.unique():
    for talker in total_msg['talker'].unique():
        mExpandAutherConver(msgdate, talker)