-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdense_paraphrase.py
94 lines (73 loc) · 2.83 KB
/
dense_paraphrase.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os
from typing import List, Tuple, Union
from file_path import DATA_FOLDER
from ingest_dialogue import ingest_dialogue
from preprocess import extract_phase1_valid_utterances
import pandas as pd
# Lookup table from a single-character annotation code to the block phrase
# that is substituted into the text. The "-" code maps to the empty string.
BLOCK_ANNOTATION_MAPPING = dict(
    r="red block",
    y="yellow block",
    g="green block",
    p="purple block",
    b="blue block",
    m="mystery block",
    # "-" is not a valid keyword name, so it is supplied through unpacking.
    **{"-": ""},
)
def load_annotation_csv(csv_file_path: Union[os.PathLike, str]):
    """Read a CSV file and return its ``Annotation`` column as a list.

    Missing cells anywhere in the table are replaced by the empty string
    before the column is extracted, so blank annotations come back as ``""``.

    Args:
        csv_file_path: Location of the CSV file; it must contain a column
            named ``Annotation``.

    Returns:
        The values of the ``Annotation`` column, in row order.
    """
    filled_table = pd.read_csv(csv_file_path).fillna("")
    return filled_table["Annotation"].values.tolist()
def parse_block_annotation(annotation_str: str):
    """Expand an annotation string into lists of block names.

    The string is split on whitespace. For every resulting token the first
    and last characters are discarded (presumably enclosing brackets -- TODO
    confirm against the annotation format) and each remaining character is
    looked up in ``BLOCK_ANNOTATION_MAPPING``.

    Args:
        annotation_str: Whitespace-separated annotation tokens.

    Returns:
        One list of block-name strings per token, in token order.

    Raises:
        KeyError: If a character inside a token is not a known block code.
    """
    parsed_groups = []
    for token in annotation_str.split():
        inner_codes = token[1:-1]
        parsed_groups.append(
            [BLOCK_ANNOTATION_MAPPING[code] for code in inner_codes]
        )
    return parsed_groups
def replace_pronouns(
    uttr_txt: str, annotation_str: str, pronoun_lst: List[Tuple[str, int, int]]
) -> List[str]:
    """Cut an utterance into pieces, swapping annotated pronouns for block names.

    The i-th pronoun is paired with the i-th group parsed from
    ``annotation_str``. When that group yields a non-empty name string
    (names joined with ``", "``), it replaces the pronoun; otherwise the
    original pronoun text is kept.

    Args:
        uttr_txt: The utterance text.
        annotation_str: Annotation for this utterance, parsed with
            ``parse_block_annotation``.
        pronoun_lst: ``(pronoun, start, end)`` triples; ``start``/``end`` are
            used as slice offsets into ``uttr_txt`` and are assumed to be in
            ascending, non-overlapping order -- TODO confirm with the
            producer of this list.

    Returns:
        The text pieces in order; ``"".join(...)`` of the result gives the
        rewritten utterance. With no pronouns the result is ``[uttr_txt]``.

    Raises:
        IndexError: If the annotation contains fewer groups than there are
            pronouns.
        KeyError: If the annotation contains an unknown block code.
    """
    if not pronoun_lst:
        # Nothing to substitute: return the text untouched instead of an
        # empty list, which would silently drop the utterance.
        return [uttr_txt]

    block_names = parse_block_annotation(annotation_str)
    comps = []
    cursor = 0  # offset of the first character not yet copied to comps
    for i, (pronoun, start, end) in enumerate(pronoun_lst):
        comps.append(uttr_txt[cursor:start])
        block_name_str = ", ".join(block_names[i])
        # An empty name string means the pronoun was left unresolved; keep
        # the speaker's own wording in that case.
        comps.append(block_name_str if block_name_str else pronoun)
        cursor = end
    # Always keep whatever follows the last pronoun, even when the cursor is
    # still at offset 0.
    comps.append(uttr_txt[cursor:])
    return comps
def paraphrase(csv_file_path: Union[os.PathLike, str], group_number: int) -> List[str]:
    """Produce the paraphrased text of every utterance of one group.

    The group's dialogue is ingested, its utterances and their pronoun lists
    are extracted with ``extract_phase1_valid_utterances``, and the
    annotations are loaded from ``csv_file_path``. Utterances that have
    pronouns are rewritten with ``replace_pronouns`` (pieces concatenated and
    stripped); the others contribute their text unchanged.

    Args:
        csv_file_path: CSV file holding one annotation per utterance.
        group_number: Group identifier passed to ``ingest_dialogue``.

    Returns:
        One string per utterance, in utterance order.

    Raises:
        AssertionError: If the number of utterances differs from the number
            of annotations.
    """
    utterances, pronoun_groups = extract_phase1_valid_utterances(
        ingest_dialogue(group_number)
    )
    annotation_strs = load_annotation_csv(csv_file_path)
    assert len(utterances) == len(annotation_strs)
    return [
        "".join(replace_pronouns(utterance.text, annotation, spans)).strip()
        if spans
        else utterance.text
        for utterance, annotation, spans in zip(
            utterances, annotation_strs, pronoun_groups
        )
    ]
def dump_annotation2csv(in_file_path, group_number, out_file_path):
    """Write a copy of an annotation CSV extended with a ``DPed`` column.

    Args:
        in_file_path: Annotation CSV to read; also passed to ``paraphrase``.
        group_number: Group identifier forwarded to ``paraphrase``.
        out_file_path: Destination of the resulting CSV.
    """
    table = pd.read_csv(in_file_path)
    # One paraphrased string per row, as returned by paraphrase().
    table["DPed"] = paraphrase(in_file_path, group_number)
    table.to_csv(out_file_path)
# Script entry point: when the module is run directly, paraphrased CSVs are
# written for the files found in DATA_FOLDER/annotated_adjudicated.
if __name__ == "__main__":
# For annotator index 1 only, build the per-group input and output CSV paths
# (group numbers 1-10, zero-padded to two digits).
# NOTE(review): neither in_csv_file_path nor out_csv_file_path is used by any
# visible statement -- confirm whether a dump_annotation2csv call is missing.
for ann_i in [1, 2]:
for group_i in range(1, 11):
if ann_i == 1:
in_csv_file_path = DATA_FOLDER.joinpath(
f"annotated_a{ann_i}/Group_{str(group_i).zfill(2)}_DP_AG.csv"
)
out_csv_file_path = DATA_FOLDER.joinpath(
f"dped_a{ann_i}/Group_{str(group_i).zfill(2)}_DPed_AG.csv"
)
# Process every entry of the adjudicated-annotation directory.
# NOTE(review): the nesting of this loop relative to the loops above cannot
# be determined from this view -- confirm it runs once, at guard level.
for file_path in DATA_FOLDER.joinpath("annotated_adjudicated").iterdir():
# The group number is the second "_"-separated field of the file stem; it is
# kept as a string for the output name and converted to int for the call.
group_number = file_path.stem.split("_")[1]
dump_annotation2csv(file_path, int(group_number), DATA_FOLDER.joinpath(f"dped_adjudicated/Group_{group_number}_DPed.csv"))