-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpostprocess_final_dataset.py
93 lines (69 loc) · 2.41 KB
/
postprocess_final_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import ast
import copy
import json
import os
import random

from tqdm import tqdm
def load_json(datadir):
    """Read a UTF-8 encoded JSON file and return the decoded object.

    Args:
        datadir: Path of the JSON file to read (despite the name, this is
            passed straight to ``open`` and must point at a file).

    Returns:
        Whatever Python object the JSON document decodes to.
    """
    with open(datadir, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed
def load_jsonl(datadir: str) -> list:
    """Read a UTF-8 encoded JSON Lines file into a list.

    Args:
        datadir: Path of the ``.jsonl`` file to read (passed straight to
            ``open``, so it must point at a file).

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    output = []
    with open(datadir, 'r', encoding='utf-8') as f:
        # Iterate the file object directly so the whole file is never held
        # in memory as a list of raw lines.
        for line in f:
            # Blank lines (typically a trailing one at end of file) carry no
            # record; skip them instead of letting json.loads fail.
            if not line.strip():
                continue
            output.append(json.loads(line))
    return output
def dump_json_output(outputs, file_name):
    """Serialize ``outputs`` as tab-indented JSON into ``file_name``.

    The file is written as UTF-8 and non-ASCII characters are emitted
    verbatim rather than as ``\\uXXXX`` escapes.

    Args:
        outputs: JSON-serializable object to write.
        file_name: Destination path; an existing file is overwritten.
    """
    serializer_options = {'ensure_ascii': False, 'indent': '\t'}
    with open(file_name, mode='w', encoding='utf-8') as sink:
        json.dump(outputs, sink, **serializer_options)
def dump_jsonl_output(outputs, file_name=None):
    """Write each element of ``outputs`` as one JSON document per line.

    Records are serialized with ``json.dumps`` defaults, so non-ASCII
    characters are written as ``\\uXXXX`` escapes.

    Args:
        outputs: Iterable of JSON-serializable objects.
        file_name: Destination path; an existing file is overwritten. The
            ``None`` default is kept for signature compatibility but is not a
            usable value: ``open`` rejects it with ``TypeError``.

    Raises:
        TypeError: If ``file_name`` is ``None`` or a record is not
            JSON-serializable.
    """
    # The context manager guarantees the handle is closed even when a record
    # fails to serialize part-way through the loop.
    with open(file_name, 'w', encoding='utf-8') as f:
        for output in outputs:
            f.write(json.dumps(output) + '\n')
def load_dataset(data_dir='./Stark', persona_seeds=range(0, 1)):
    """Load and concatenate the ``stark_<seed>.json`` files of a directory.

    Called without arguments this reads only ``./Stark/stark_0.json``, which
    is what the previously hard-coded version did.

    Args:
        data_dir: Directory that contains the ``stark_<seed>.json`` files.
        persona_seeds: Iterable of seed numbers whose files should be loaded,
            in order.

    Returns:
        A single list holding the entries of every loaded file, in seed
        order. Each file is presumably a JSON array of instances, since its
        content is merged with ``list.extend`` -- verify against the data.
    """
    all_stark = []
    for persona_seed_num in persona_seeds:
        stark_path = os.path.join(data_dir, f'stark_{persona_seed_num}.json')
        all_stark.extend(load_json(stark_path))
    return all_stark
def process_dialog(dialog):
    """Mark turns that have neither text nor shared content as non-verbal.

    Args:
        dialog: The dialogue of one session, either as the string form of a
            Python list of turn dicts or as an already-parsed list. Every
            turn must provide the keys ``'utter'`` and ``'sharing_info'``.

    Returns:
        A new list of deep-copied turns. A turn whose ``'utter'`` is the
        empty string and whose ``'sharing_info'`` is empty gets its
        ``'utter'`` replaced by ``'<non verbal>'``; all other turns are
        copied unchanged. The input is never mutated.

    Raises:
        ValueError, SyntaxError: If ``dialog`` is a string that is not a
            valid Python literal.
        KeyError: If a turn lacks ``'utter'`` or ``'sharing_info'``.
    """
    # SECURITY: this string comes from a data file. The previous eval() call
    # would execute any expression embedded in it; ast.literal_eval only
    # accepts plain literals (lists, dicts, strings, numbers, booleans, None).
    # Already-parsed input is passed through so re-running on processed data
    # does not fail.
    dialogue = ast.literal_eval(dialog) if isinstance(dialog, str) else dialog

    redialog = []
    for item in dialogue:
        cp_item = copy.deepcopy(item)
        if item['utter'] == '' and not item['sharing_info']:
            cp_item['utter'] = '<non verbal>'
        redialog.append(cp_item)
    return redialog
def process_dataset(dataset):
    """Return a copy of ``dataset`` with every session dialogue post-processed.

    For each instance, the session at position ``i`` (1-based) of its
    ``'episode'`` list has the value under ``'session<i>:dialogue'`` replaced
    by the result of ``process_dialog``. Everything else is deep-copied, so
    the input is left untouched.

    Args:
        dataset: Iterable of instance dicts, each with an ``'episode'`` list
            of session dicts.

    Returns:
        A list of new instance dicts (plain ``dict`` objects) with the same
        keys in the same order as the input instances.

    Raises:
        KeyError: If an instance has no ``'episode'`` key or a session lacks
            its ``'session<i>:dialogue'`` key.
    """
    re_dataset = []
    for instance in dataset:
        re_epi = []
        for idx, session in enumerate(instance['episode'], start=1):
            dialogue_key = f'session{idx}:dialogue'
            p_dialog = process_dialog(session[dialogue_key])
            cp_session = copy.deepcopy(session)
            cp_session[dialogue_key] = p_dialog
            re_epi.append(cp_session)

        # Copy every field except the episode, which has just been rebuilt;
        # deep-copying it as part of the instance would only produce a second
        # copy that is thrown away. Iterating items() keeps the key order.
        cp_instance = {
            key: re_epi if key == 'episode' else copy.deepcopy(value)
            for key, value in instance.items()
        }
        re_dataset.append(cp_instance)
    return re_dataset
if __name__ == '__main__':
    # Read the raw instances and report how many were loaded.
    raw_instances = load_dataset()
    print(len(raw_instances))

    # Make sure the output folder exists before processing starts.
    output_dir = 'Stark/post-process'
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, 'stark_0.json')

    # Post-process every dialogue, report the resulting instance count and
    # store the result as tab-indented JSON.
    cleaned_instances = process_dataset(raw_instances)
    print(len(cleaned_instances))
    dump_json_output(cleaned_instances, output_path)