-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_normalize.py
89 lines (75 loc) · 2.41 KB
/
data_normalize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import norm_utils
import utils
import numpy as np
import pandas as pd
from tqdm import tqdm
# Turn off pandas' chained-assignment warning for the whole script.
pd.set_option('mode.chained_assignment', None)

# Input tables: the base graph plus a table of relatives columns.
graph = pd.read_csv('../graph/graph_base.csv')
relatives = pd.read_csv('../graph/relatives.csv')

# Row count of the base graph; used as the progress-bar total below.
total_number = len(graph)
# normalize address
# Each cell is evaluated as a Python expression and treated as a sequence:
# the first entry goes through norm_utils.address_norm, an empty sequence
# becomes {}.
address_property = ['birthPlace', 'deathPlace', 'nationality']
for p in address_property:
    normalized = []
    # Wrapping the iterable lets tqdm close the bar when the column is done.
    for raw in tqdm(graph[p], total=total_number, desc=p):
        # NOTE(review): eval() executes whatever expression the CSV cell
        # contains. If the cells are plain literals, ast.literal_eval would be
        # the safe replacement -- confirm the data format before switching.
        carry = eval(raw)
        if len(carry) >= 1:
            normalized.append(norm_utils.address_norm(carry[0]))
        else:
            normalized.append({})
    # Assign the whole column at once: the previous graph[p][i] = ... form is
    # chained indexing, which pandas does not guarantee to write back into
    # `graph` (and never does under Copy-on-Write). dtype=object keeps each
    # result as a single cell value.
    graph[p] = pd.Series(normalized, index=graph.index, dtype=object)
# normalize time
# Every cell is passed as-is to norm_utils.time_norm together with a flag
# that is 1 for the 'debutTime' column and 0 otherwise.
time_property = ['activeTime', 'debutTime']
for p in time_property:
    # The flag depends only on the column, so compute it once per column.
    flag = 1 if p == 'debutTime' else 0
    # to_numpy() yields the same scalar objects that graph[p][i] returned.
    # Wrapping the iterable lets tqdm close the bar when the column is done.
    normalized = [
        norm_utils.time_norm(carry, flag)
        for carry in tqdm(graph[p].to_numpy(), total=total_number, desc=p)
    ]
    # Assign the whole column at once: the previous graph[p][i] = ... form is
    # chained indexing, which pandas does not guarantee to write back into
    # `graph` (and never does under Copy-on-Write).
    graph[p] = pd.Series(normalized, index=graph.index, dtype=object)
# normalize name
# Each cell is evaluated as a Python expression and treated as a sequence:
# the first entry goes through norm_utils.name_norm, an empty sequence
# becomes [].
name_property = ['name', 'originalName', 'foreignName', 'nickname']
for p in name_property:
    normalized = []
    # Wrapping the iterable lets tqdm close the bar when the column is done.
    for raw in tqdm(graph[p], total=total_number, desc=p):
        # NOTE(review): eval() executes whatever expression the CSV cell
        # contains. If the cells are plain literals, ast.literal_eval would be
        # the safe replacement -- confirm the data format before switching.
        carry = eval(raw)
        if len(carry) >= 1:
            normalized.append(norm_utils.name_norm(carry[0]))
        else:
            normalized.append([])
    # Assign the whole column at once: the previous graph[p][i] = ... form is
    # chained indexing, which pandas does not guarantee to write back into
    # `graph` (and never does under Copy-on-Write). dtype=object keeps each
    # result as a single cell value.
    graph[p] = pd.Series(normalized, index=graph.index, dtype=object)
# normalize award
# Each cell is evaluated as a Python expression and treated as a sequence:
# the first entry goes through norm_utils.award_norm, an empty sequence
# becomes [].
award_property = ['award']
for p in award_property:
    normalized = []
    # Wrapping the iterable lets tqdm close the bar when the column is done.
    for raw in tqdm(graph[p], total=total_number, desc=p):
        # NOTE(review): eval() executes whatever expression the CSV cell
        # contains. If the cells are plain literals, ast.literal_eval would be
        # the safe replacement -- confirm the data format before switching.
        carry = eval(raw)
        if len(carry) >= 1:
            normalized.append(norm_utils.award_norm(carry[0]))
        else:
            normalized.append([])
    # Assign the whole column at once: the previous graph[p][i] = ... form is
    # chained indexing, which pandas does not guarantee to write back into
    # `graph` (and never does under Copy-on-Write). dtype=object keeps each
    # result as a single cell value.
    graph[p] = pd.Series(normalized, index=graph.index, dtype=object)
# normalize works
# Each cell is evaluated as a Python expression and treated as a sequence:
# the first entry goes through norm_utils.works_norm, an empty sequence
# becomes [].
works_property = ['notableWork', 'debutWork']
for p in works_property:
    normalized = []
    # Wrapping the iterable lets tqdm close the bar when the column is done.
    for raw in tqdm(graph[p], total=total_number, desc=p):
        # NOTE(review): eval() executes whatever expression the CSV cell
        # contains. If the cells are plain literals, ast.literal_eval would be
        # the safe replacement -- confirm the data format before switching.
        carry = eval(raw)
        if len(carry) >= 1:
            normalized.append(norm_utils.works_norm(carry[0]))
        else:
            normalized.append([])
    # Assign the whole column at once: the previous graph[p][i] = ... form is
    # chained indexing, which pandas does not guarantee to write back into
    # `graph` (and never does under Copy-on-Write). dtype=object keeps each
    # result as a single cell value.
    graph[p] = pd.Series(normalized, index=graph.index, dtype=object)
# add works property
# 'works' is the element-wise sum of the two works columns.
graph['works'] = graph['notableWork'] + graph['debutWork']

# Copy the relatives columns into the graph as (graph column, relatives column).
# NOTE(review): 'hasSibling' is filled from 'hasRelative' -- confirm that this
# mapping is intended.
relative_columns = (
    ('hasSibling', 'hasRelative'),
    ('hasSpouse', 'hasSpouse'),
    ('hasParent', 'hasParent'),
    ('hasChild', 'hasChild'),
)
for target, source in relative_columns:
    graph[target] = relatives[source]

# Write the result out, then report completion.
output_path = '../graph/graph.csv'
graph.to_csv(output_path, encoding='utf_8_sig')
print('Normalize finished')