"""
computes various cache things on top of db.py so that the server
(running from serve.py) can start up and serve faster when restarted.
this script should be run whenever db.p is updated, and
creates db2.p, which can be read by the server.
"""
import time
import pickle
import dateutil.parser
import string
from utils import safe_pickle_dump, Config, load_json_db
CACHE = {}
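# common stopwords to skip when condensing a title into a citation key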
IGNORE_WORD = [
    'about', 'am', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'do', 'does',
    'for', 'from', 'he', 'in', 'is', 'it', 'me', 'my', 'of', 'on', 'she',
    'so', 'the', 'they', 'to', 'under', 'using', 'via', 'we', 'with',
    'without', 'you'
]
def clean_title(title):
    # Remove punctuation; approach from https://datagy.io/python-remove-punctuation-from-string/
    title = title.translate(str.maketrans('', '', string.punctuation))
    # TODO: use a smarter text processing technique
    title_words = [w for w in title.split(' ') if len(w) > 1 and w.lower() not in IGNORE_WORD]
    return ' '.join(title_words)
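# e.g. clean_title('A Survey of Deep Learning') -> 'Survey Deep Learning'
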
def format_booktitle(conf_id, conf_name):
    if conf_id[-1] == 'W':
        conf_id = conf_id[:-1]
        conf_name = conf_name.replace('_workshop', ' workshop')
    conf_name = conf_name.replace(conf_id, conf_id[:-4])
    return conf_name
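# assuming ids of the form '<NAME><YYYY>' (e.g. 'CVPR2021', consistent with the
# [-4:] year slicing below), this strips the workshop marker and year, e.g.
# format_booktitle('CVPR2021W', 'CVPR2021_workshop') -> 'CVPR workshop'
# (illustrative values)
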
print('loading the paper database from json files in', Config.json_dir)
db = load_json_db()
db = {pid: p for pid, p in db.items()
      if dateutil.parser.parse(p['published']).year >= Config.minimum_year}
if not Config.include_workshop_papers:
    db = {pid: p for pid, p in db.items() if not p['is_workshop']}
print('loading tfidf_meta', Config.meta_path)
with open(Config.meta_path, 'rb') as f:
    meta = pickle.load(f)
vocab = meta['vocab']
idf = meta['idf']
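# vocab maps word -> index into the idf array (inverse document frequencies),
# presumably produced by the tfidf preprocessing step; both are reused below
# to weight search terms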
print('decorating the database with additional information...')
for pid, p in db.items():
    p['year'] = p['conf_id'][-4:]
    p['pid'] = pid
    p['conf_id'] = p['conf_id'] + ('W' if p['is_workshop'] else '')
    p['composed_conf_id'] = p['conf_id'] + ('_' + p['conf_sub_id'] if p['is_workshop'] else '')
    bib_id_title = clean_title(p['title'])
    bib_id_title = ''.join(bib_id_title.split(' ')[:3])
    p['bib_id'] = f'{p["authors"][0].split(" ")[-1]}{p["published"][:4]}{bib_id_title}'
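    # bib_id is '<first author's last name><year><first three title words>',
    # e.g. 'Smith2021SurveyDeepLearning' (illustrative); next, format authors
    # BibTeX-style as 'Last, First M. and Last, ...' with middle name tokens
    # abbreviated to initials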
    bib_authors = [a.replace('\n', '').replace('.', '. ').split(' ') for a in p["authors"]]
    bib_authors = [[a.strip() for a in authors_list if len(a) > 0] for authors_list in bib_authors]
    for authors_list in bib_authors:
        for i, aut in enumerate(authors_list[1:-1]):
            authors_list[i + 1] = f'{aut[0]}.'
    bib_authors = [f'{a[-1]}, {" ".join(a[:-1])}' for a in bib_authors]
    p['bib_authors'] = ' and '.join(bib_authors)
    p['bib_booktitle'] = format_booktitle(p['conf_id'], p['conf_name'])
print('computing min/max time for all papers...')
tts = [time.mktime(dateutil.parser.parse(p['published']).timetuple()) for pid, p in db.items()]
ttmin = min(tts) * 1.0
ttmax = max(tts) * 1.0
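# tscore maps each paper's publish time onto [0, 1): oldest ~0, newest ~1;
# the +1 in the denominator avoids division by zero if all dates coincide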
for pid, p in db.items():
    tt = time.mktime(dateutil.parser.parse(p['published']).timetuple())
    p['tscore'] = (tt - ttmin) / (ttmax - ttmin + 1)
print('precomputing conference data...')
composed_conference_ids = set((p['conf_id'], dateutil.parser.parse(p['published'])) for pid, p in db.items())
CACHE['conference_sorted_pids'] = {
    cid[0]: [pid for pid, p in db.items() if p['conf_id'] == cid[0]]
    for cid in composed_conference_ids
}
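# sort the (conf_id, published) pairs newest-first, then derive the display
# name of the most recent conference by stripping the workshop 'W' marker and
# the 4-digit year suffix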
composed_conference_ids = sorted(composed_conference_ids, key=lambda x: x[1], reverse=True)
most_recent_conference_idx = composed_conference_ids[0][0]
if most_recent_conference_idx.endswith('W'):
    most_recent_conference_idx = most_recent_conference_idx[:-1]
CACHE['most_recent_conference_name'] = most_recent_conference_idx[:-4]
CACHE['newest_conference_year'] = composed_conference_ids[0][1].strftime('%Y')
CACHE['oldest_conference_year'] = composed_conference_ids[-1][1].strftime('%Y')
# some utilities for creating a search index for faster search
punc = "'!\"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'" # removed hyphen from string.punctuation
trans_table = {ord(c): None for c in punc}
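
# makedict turns a string into {word: idf weight}. forceidf overrides the
# learned idf for every word (used below to strongly boost title/author/venue
# matches), and scale multiplies the learned weights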
def makedict(s, forceidf=None, scale=1.0):
    words = set(s.lower().translate(trans_table).strip().split())
    idfd = {}
    for w in words:  # TODO: if we're using bigrams in vocab then this won't search over them
        if forceidf is None:
            if w in vocab:
                # we have an idf for this word
                idfval = idf[vocab[w]] * scale
            else:
                idfval = 1.0 * scale  # assume idf 1.0 (low)
        else:
            idfval = forceidf
        idfd[w] = idfval
    return idfd
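
# merge_dicts sums word weights across the per-field dictionaries, so a word
# that appears in several fields (e.g. title and summary) scores higher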
def merge_dicts(dlist):
    m = {}
    for d in dlist:
        for k, v in d.items():
            m[k] = m.get(k, 0) + v
    return m
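
# each paper's search entry merges strongly boosted (forceidf=5) title,
# author, conference id and conference name terms with idf-weighted summary
# terms (note: scale has no effect when forceidf is given, see makedict above)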
print('building an index for faster search...')
search_dict = {}
for pid, p in db.items():
    dict_title = makedict(p['title'], forceidf=5, scale=3)
    dict_authors = makedict(' '.join(x for x in p['authors']), forceidf=5)
    dict_categories = {p['composed_conf_id'].lower(): 5}
    dict_conf_name = makedict(p['conf_name'], forceidf=5, scale=3)
    if 'and' in dict_authors:
        # special case for "and" handling in authors list
        del dict_authors['and']
    dict_summary = makedict(p['summary'])
    search_dict[pid] = merge_dicts([dict_title, dict_authors, dict_categories, dict_summary, dict_conf_name])
CACHE['search_dict'] = search_dict
# save the cache
print('writing', Config.serve_cache_path)
safe_pickle_dump(CACHE, Config.serve_cache_path)
print('writing', Config.db_serve_path)
safe_pickle_dump(db, Config.db_serve_path)