-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfact_extract.py
71 lines (57 loc) · 2.38 KB
/
fact_extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import pandas as pd
import numpy as np
import utils
from tqdm import tqdm
import os
import regex
from improved_extract import improved_extract
# Build the base fact table for the graph: for every page file under
# ../filtered_data/<category>/ collect one value list per ontology predicate
# and write the combined table to ../graph/graph_base.csv.
#
# NOTE(review): the indentation of this script was lost before review. The
# nesting below is reconstructed from the data flow (infobox first, regex only
# when the predicate is still unset, triples only when the value list is still
# empty) -- confirm against the authoritative copy.

root_dir = '../filtered_data/'
ontology = pd.read_excel('../graph/ontology.xlsx')
fact = {}
data_list = ['actor', 'writer', 'director']
# Kept because it is a module-level name; nothing in this script updates it.
counters = {'infobox': 0, 're': 0, 'jiagu': 0}

# Per-category keyword lists handed to utils.type for every page. Built once
# instead of once per page.
rule = {
    'actor': [['电影', '演员'], ['协会', '处女作', '奖']],
    'writer': [['电影', '编剧'], ['协会', '处女作', '奖', '编剧电影']],
    'director': [['电影', '导演'], ['协会', '处女作', '奖', '导演电影']],
}


def _split_cell(cell):
    """Split a ';'-separated ontology cell into its non-empty parts.

    Empty spreadsheet cells are read by pandas as NaN (a float); those, and
    any other non-string value, yield an empty list instead of failing on
    ``.split``. Empty segments (for example from a trailing ';') are dropped,
    because an empty regex pattern would match at every position of the text.
    """
    if not isinstance(cell, str):
        return []
    return [part for part in cell.split(';') if part]


def _load_ontology_rows(table):
    """Return ``(predicate, fact_keys, patterns)`` for every ontology row.

    Columns are read by position (first, second, third) so the result does
    not depend on the header names in the spreadsheet, and so no integer key
    is used on a label-indexed Series.
    """
    return [
        (row[0], _split_cell(row[1]), _split_cell(row[2]))
        for row in table.itertuples(index=False, name=None)
    ]


def _extract_page_fact(page_path, ontology_rows):
    """Collect the facts of one page file and return ``(page_id, page_fact)``.

    ``page_fact`` maps 'name', 'type' and every ontology predicate to its
    extracted value(s).
    """
    page = utils.page_extract(page_path)
    text = page['text']
    infobox = utils.infobox_extract(text)
    page_fact = {
        'name': [page['title']],
        'type': utils.type(page_path, rule),
    }

    for predicate, fact_keys, patterns in ontology_rows:
        # Step 1: infobox entries. Every matching key overwrites the previous
        # one, so the last match wins.
        for info in infobox:
            for key, value in info.items():
                if key in fact_keys:
                    page_fact[predicate] = [value]

        # Step 2: regex fallback, only when the predicate is still unset.
        if predicate not in page_fact:
            matches = []
            for pattern in patterns:
                matches.extend(m.group() for m in regex.finditer(pattern, text))
            page_fact[predicate] = matches

    # Step 3: extract by jiagu -- triples fill predicates that are still empty.
    for triple in improved_extract(text):
        for predicate, fact_keys, _patterns in ontology_rows:
            if triple[1] in fact_keys and len(page_fact[predicate]) == 0:
                page_fact[predicate] = [triple[2]]

    return page['id'], page_fact


# Parse the ontology once; it does not change while pages are processed.
ontology_rows = _load_ontology_rows(ontology)

for data in data_list:
    data_dir = os.path.join(root_dir, data)
    # Iterate exactly the entries that are counted for the progress bar, so
    # sub-directories are neither processed nor push the bar past its total.
    page_files = [
        entry for entry in os.listdir(data_dir)
        if os.path.isfile(os.path.join(data_dir, entry))
    ]
    # The context manager closes the bar even if a page raises.
    with tqdm(total=len(page_files)) as pbar:
        pbar.set_description("Processing "+data+" pages")
        for f in page_files:
            page_dir = os.path.join(data_dir, f)
            page_id, page_fact = _extract_page_fact(page_dir, ontology_rows)
            # A page id seen in several categories keeps the last result.
            fact[page_id] = page_fact
            pbar.update(1)

# One row per page id, one column per fact name.
fact_df = pd.DataFrame(fact).T
fact_df.to_csv('../graph/graph_base.csv', encoding='utf_8_sig')