-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck_cs_dic.py
147 lines (129 loc) · 5.11 KB
/
check_cs_dic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# -*- coding:utf-8 -*-
import pandas as pd
import json
from __init__ import LoadDialogueDataset
from Literal_generator import literal_generation_for_evaluation
import pandas as pd
from sklearn.model_selection import train_test_split
from sentiment_check import find_sentiment
import re
# Patterns used to inspect each generated response; compiled once instead of
# on every loop iteration.
_MID_SENTENCE_DOT_RE = re.compile(r'\. *[a-zA-Z0-9]')
_QUESTION_MARK_RE = re.compile(r'\?')
_NON_WORD_RE = re.compile(r"[^\w]+")

# A sentiment score must exceed this value to be tagged "pos" or "neg".
_SENTIMENT_THRESHOLD = 0.8


def _merge_label(row):
    """Collapse the ``sarcasm``/``question``/``rq`` flags of one row into one id.

    Returns 1 ("rq") when ``rq`` is set or when both ``sarcasm`` and
    ``question`` are set, 2 ("sarcasm") when only ``sarcasm`` is set, and
    0 ("literal") otherwise.
    """
    if row['rq'] == 1 or (row['sarcasm'] == 1 and row['question'] == 1):
        return 1  # rq
    if row['sarcasm'] == 1:
        return 2  # sarcasm
    return 0  # literal


def _sentiment_tag(senti):
    """Map the output of ``find_sentiment`` to "pos", "neg" or "neu".

    ``senti[0][0]`` is treated as the positive score and ``senti[0][1]`` as
    the negative score; a confident negative score wins over a positive one.
    """
    if senti[0][0] > _SENTIMENT_THRESHOLD and senti[0][1] <= _SENTIMENT_THRESHOLD:
        return "pos"
    if senti[0][1] > _SENTIMENT_THRESHOLD:
        return "neg"
    return "neu"


def _word_count(text):
    """Return the number of word tokens in *text*, ignoring punctuation.

    Runs of non-word characters are turned into a single space and the result
    is split on whitespace, so leading/trailing punctuation never produces an
    empty token and an empty string counts as 0 words.
    """
    return len(_NON_WORD_RE.sub(" ", text).split())


def load_dialogue_dataset():
    """Generate and annotate literal responses for the sarcasm test split.

    Steps:
      1. Read the CSV at ``LoadDialogueDataset.SARC_DATASET_PATH`` and derive a
         three-way ``tri_label`` column (see ``_merge_label``).
      2. Take a stratified 20% test split (``random_state=43``) and print its
         size.
      3. For every test row call ``literal_generation_for_evaluation`` and
         append four annotations computed from the last element of its result
         (treated as the generated response): sentiment tag, word count,
         whether a dot occurs in the middle of the text, and whether it
         contains a question mark.
      4. Write the annotated rows to ``dialogue_for_train_2.csv``.
      5. Read ``dialogue_for_train.csv`` and filter it down to long,
         single-sentence, non-question responses.

    Returns nothing; the observable effects are the printed progress lines and
    the CSV written in step 4.
    """
    # Use the sarcasm detection shared task conversations.
    df = pd.read_csv(LoadDialogueDataset.SARC_DATASET_PATH)
    df['tri_label'] = df[['sarcasm', 'question', 'rq']].apply(_merge_label, axis=1)

    _, test_set = train_test_split(df,
                                   stratify=df['tri_label'],
                                   test_size=0.2,
                                   random_state=43)
    print(test_set.shape[0])
    # reset_index returns a new frame; the original call discarded the result.
    test_set = test_set.reset_index(drop=True)

    data_list = test_set[["sarcasm", "question", "rq",
                          "context_2", "context_1", "response"]].values.tolist()

    result_all = []
    for count, one_dialogue in enumerate(data_list, start=1):
        print("No.{} | sarcasm {} | question {} | rq {}".format(
            count, one_dialogue[0], one_dialogue[1], one_dialogue[2]))

        # NOTE(review): the result is expected to be the six input fields plus
        # the generated response, so that the final row matches the eleven
        # output columns below -- confirm against Literal_generator.
        result = literal_generation_for_evaluation(one_dialogue)
        generated = result[-1]

        result.append(_sentiment_tag(find_sentiment(generated)))
        result.append(_word_count(generated))
        # Flag a dot followed by more text, i.e. more than one sentence.
        result.append(1 if _MID_SENTENCE_DOT_RE.search(generated) else 0)
        # Flag generated responses that are themselves questions.
        result.append(1 if _QUESTION_MARK_RE.search(generated) else 0)
        result_all.append(result)

    df_out = pd.DataFrame(result_all,
                          columns=["sarcasm", "question", "rq",
                                   "context_2", "context_1", "response",
                                   "robot_res", "sentiment", "len", "dot",
                                   "is_question"])
    df_out.to_csv("dialogue_for_train_2.csv", index=False)

    # NOTE(review): this reads "dialogue_for_train.csv", not the
    # "dialogue_for_train_2.csv" file written above, and filters on the
    # dataset's "question" label rather than "is_question" -- confirm both are
    # intentional.
    df = pd.read_csv("dialogue_for_train.csv")
    df_proc = df.loc[(df['len'] >= 8) & (df['dot'] == 0) & (df['question'] == 0)]

    # Drop rows whose generated response contains a question mark. The
    # original row-wise helper returned nothing, so applying it replaced the
    # filtered frame with a column of None instead of filtering it.
    has_question = df_proc['robot_res'].astype(str).str.contains(r'\?', regex=True)
    df_proc = df_proc.loc[~has_question]
    print()
if __name__ == '__main__':
    # Script entry point: load the conversation dataset for evaluation and
    # run the literal-response generation/annotation pipeline.
    load_dialogue_dataset()