-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathmulticore_run.py
57 lines (43 loc) · 1.93 KB
/
multicore_run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import multiprocessing
from pipeline.entitylinker import *
from pipeline.triplealigner import *
from pipeline.datareader import DBpediaAbstractsDataReader
from pipeline.writer import JsonWriter
from pipeline.coreference import *
from utils.triplereader import *
import argparse
__START_DOC__ = 0 #start reading from document number
__CORES__ = 7  # number of worker processes in the multiprocessing pool
# Reading the DBpedia Abstracts Dataset
# (skip=__START_DOC__ resumes reading from that document index)
reader = DBpediaAbstractsDataReader('./datasets/wikipedia-abstracts/csv/dbpedia-abstracts.csv', db_wd_mapping='./datasets/wikidata/dbpedia-wikidata-sameas-dict.csv', skip=__START_DOC__)
# Loading the WikidataSpotlightEntityLinker ... DBpedia Spotlight with mapping DBpedia URIs to Wikidata
link = WikidataSpotlightEntityLinker('./datasets/wikidata/dbpedia-wikidata-sameas-dict.csv', support=10, confidence=0.4)
# Simple pronoun/mention coreference resolution over linked entities
coref = SimpleCoreference()
# Wikidata triple store used by all the aligners below
trip_read = TripleReader('./datasets/wikidata/wikidata-triples.csv')
Salign = SimpleAligner(trip_read)
# Links property surface forms to Wikidata property IDs
prop = WikidataPropertyLinker('./datasets/wikidata/wikidata-properties.csv')
date = DateLinker()
SPOalign = SPOAligner(trip_read)
NSalign = NoSubjectAlign(trip_read)
# JSON output writer; startfile keeps output file numbering aligned with __START_DOC__
writer = JsonWriter('./out', "re-nlg", startfile=__START_DOC__)
def reading_documents():
    """Yield documents from the reader after the non-parallelizable stages.

    SUTime-based date linking is not safe to run across processes, so it is
    applied here, in the parent process, before documents are dispatched to
    the worker pool.
    """
    for doc in reader.read_documents():
        yield date.run(doc)
def multhithreadprocess(d):
try:
d = link.run(d)
d = NSalign.run(d)
d = coref.run(d)
d = date.run(d)
d = Salign.run(d)
d = prop.run(d)
d = SPOalign.run(d)
writer.run(d)
print "Document Title: %s \t Number of Annotated Entities %s \t Number of Annotated Triples %s" % (d.title, len(d.entities), len(d.triples))
except Exception as e:
print "error Processing document %s" % d.title
if __name__ == '__main__':
    # Fan documents out to __CORES__ worker processes; Pool.map blocks until
    # every document has been processed.
    p = multiprocessing.Pool(__CORES__)
    try:
        p.map(multhithreadprocess, reading_documents())
    finally:
        # Bug fix: the pool was never closed/joined, leaving worker
        # processes without an orderly shutdown on exit or error.
        p.close()
        p.join()