-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathembed.py
50 lines (38 loc) · 3.77 KB
/
embed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import spacy
# The coreference model is not bundled with spaCy; install it with:
# pip install https://github.com/explosion/spacy-experimental/releases/download/v0.6.1/en_coreference_web_trf-3.4.0a2-py3-none-any.whl
# Both pipelines are loaded once, at import time, and shared by every call in
# this module. `nlp_coref` supplies the span groups read from `doc.spans`.
nlp_coref = spacy.load("en_coreference_web_trf")
# `nlp` supplies the named entities read from `doc.ents`.
nlp = spacy.load("en_core_web_sm")
# Words treated as pronouns / pronoun-like fillers. Each entry appears exactly
# once (the earlier literal repeated the same words several times); the order
# is the order of first appearance. Entries are lower-case except "I".
# NOTE(review): "name" is not a pronoun; it is kept because it was listed
# explicitly -- presumably on purpose, confirm with the callers.
PRONOUNS = [
    # personal, possessive and reflexive (third person)
    "he", "she", "it", "they", "them", "him", "her", "his", "hers", "its",
    "their", "theirs", "himself", "herself", "itself", "themselves",
    # demonstrative
    "this", "that", "these", "those",
    # interrogative / relative
    "who", "whom", "whose", "which", "what", "where", "when", "why", "how",
    # indefinite
    "all", "any", "anybody", "anyone", "anything", "both", "each", "other",
    "either", "everyone", "everybody", "everything", "few", "many",
    "neither", "nobody", "none", "no one", "nothing", "one", "some",
    "somebody", "someone", "something", "such",
    # reflexive (first and second person)
    "myself", "yourself", "yourselves", "ourselves",
    # compound relative
    "whoever", "whomever", "whichever", "whatever",
    # personal and possessive (first and second person)
    "you", "I", "me", "we", "us", "my", "your", "mine", "yours", "our",
    "ours",
    # extra, non-pronoun entry
    "name",
]
class Entity:
    """A mention in a text, with token and character offsets.

    Equality is deliberately fuzzy: two entities compare equal when one
    text contains the other (case-insensitively) and one of them starts
    inside the other's token range. Because of that, equality is not
    transitive and instances are unhashable.

    Attributes:
        text: Surface form of the mention. It is always passed through
            ``str()`` before comparing, so any object whose string form is
            the mention text works.
        start: Index of the first token, or -1 when unknown.
        end: End token index, or -1 when unknown.
        start_char: Start character offset, or -1 when unknown.
        end_char: End character offset, or -1 when unknown.
    """

    def __init__(self, text: str, start: int = -1, end: int = -1,
                 start_char: int = -1, end_char: int = -1):
        self.text = text
        self.start = start
        self.end = end
        self.start_char = start_char
        self.end_char = end_char

    def __str__(self):
        """Return the text followed by its ``(start-end)`` token range."""
        return f"{self.text} ({self.start}-{self.end})"

    def __repr__(self):
        """Return the same representation as ``str()``."""
        return f"{self.text} ({self.start}-{self.end})"

    def __eq__(self, other):
        """Return True when *other* names the same mention as this entity.

        Objects that do not expose ``text``, ``start`` and ``end`` (for
        example ``None`` or a plain string) are not comparable: the method
        returns ``NotImplemented`` so that ``==`` evaluates to False instead
        of raising ``AttributeError``.
        """
        try:
            other_text = str(other.text).lower()
            other_start = other.start
            other_end = other.end
        except AttributeError:
            return NotImplemented

        own_text = str(self.text).lower()
        # One surface form must be a substring of the other ...
        if own_text not in other_text and other_text not in own_text:
            return False
        # ... and one mention must start inside the other's token range.
        return (self.start <= other_start <= self.end
                or other_start <= self.start <= other_end)

    # Defining __eq__ already makes the class unhashable; spell it out so the
    # intent is visible (fuzzy equality cannot back a consistent hash).
    __hash__ = None
# Entity Embedding
def entities(text: str) -> list:
    """Collect the mentions found in *text* as clusters of ``Entity`` objects.

    Every span group produced by the coreference pipeline becomes one
    cluster. Each named entity found by the NER pipeline that does not
    match (via ``Entity`` equality) any member of an existing cluster is
    appended as a new single-element cluster.

    Args:
        text: Raw text to analyse.

    Returns:
        A list of clusters, each cluster being a list of ``Entity``.
    """
    # Run both pipelines over the same text: coreference first, then NER.
    coref_doc = nlp_coref(text)
    ner_doc = nlp(text)

    named = []
    for span in ner_doc.ents:
        named.append(
            Entity(span.text, span.start, span.end, span.start_char, span.end_char)
        )

    # One cluster per span group; the mention span itself is stored as text.
    groups = []
    for span_group in coref_doc.spans.values():
        members = []
        for mention in span_group:
            members.append(
                Entity(mention, mention.start, mention.end,
                       mention.start_char, mention.end_char)
            )
        groups.append(members)

    # Singletons added here are visible to the checks for later entities.
    for candidate in named:
        if any(candidate in group for group in groups):
            continue
        groups.append([candidate])
    return groups
if __name__ == "__main__":
    # Project-local helpers are only needed when the file is run as a script.
    from preprocess import COQA_DEV_PATH
    from preprocess.coqa_data import parse_coqa
    from utils import load_json_file

    # Smoke check: load the CoQA dev file, parse its 'data' entry and print
    # the length of the parsed result.
    dev_payload = load_json_file(COQA_DEV_PATH)
    coqa_conversations = parse_coqa(dev_payload['data'])
    conversation_count = len(coqa_conversations)
    print(conversation_count)
    # print(entities(coqa_conversations[0].context))