analyze.py
"""
Reads txt files of all papers and computes tfidf vectors for all papers.
Dumps results to file tfidf.p
"""
import os
import pickle
from collections import namedtuple
from random import shuffle, seed

import dateutil.parser
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from utils import Config, safe_pickle_dump, load_json_db
PidConf = namedtuple('PidConf', 'pid, name, year, subid')
seed(1337)
max_train = 50000 # max number of tfidf training documents (chosen randomly), for memory efficiency
max_features = 5000
# read database
db = load_json_db()
# go through all papers, read their txt files, and keep the ones that pass the filters
txt_paths, pids = [], []
pid_confs = []
n = 0
# for pid,j in db.items():
for key in db:
    n += 1
    basename = db[key]['pdf_url'].split('/')[-1]
    txt_path = os.path.join(
        Config.txt_dir, db[key]['conf_id'], db[key]['conf_sub_id'],
        basename) + '.txt'
    if os.path.isfile(txt_path):  # some pdfs don't convert to txt
        pub_year = int(dateutil.parser.parse(db[key]['published']).strftime('%Y'))
        if not Config.include_workshop_papers and db[key]['is_workshop']:
            print("skipped %d/%d (%s): not using workshops" % (n, len(db), key))
        elif pub_year < Config.minimum_year:
            print("skipped %d/%d (%s): older than minimum year" % (n, len(db), key))
        else:
            with open(txt_path, 'r') as f:
                txt = f.read()
            if 1000 < len(txt) < 500000:  # 500K is a VERY conservative upper bound
                txt_paths.append(txt_path)  # todo later: maybe filter some of these further
                pids.append(key)
                conf_id = db[key]['conf_id']
                pid_confs.append(PidConf(key, conf_id[:-4], conf_id[-4:], db[key]['conf_sub_id'].lower()))
                print("read %d/%d (%s) with %d chars" % (n, len(db), key, len(txt)))
            else:
                print("skipped %d/%d (%s) with %d chars: suspicious!" % (n, len(db), key, len(txt)))
    # else:
    #     print("could not find %s in txt folder." % (txt_path, ))
print("in total read in %d text files out of %d db entries." % (len(txt_paths), len(db)))
# compute tf-idf vectors with scikit-learn
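# notes on the settings below: norm='l2' scales every document vector to unit
# length, so plain dot products between rows of X are cosine similarities (used
# for the nearest-neighbor queries further down); the token_pattern keeps only
# word tokens that start with a letter or underscore, which drops purely numeric
# tokens; unigrams and bigrams are extracted and capped at max_features terms.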
v = TfidfVectorizer(
    input='content',
    encoding='utf-8', decode_error='replace', strip_accents='unicode',
    lowercase=True, analyzer='word', stop_words='english',
    token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b',
    ngram_range=(1, 2), max_features=max_features,
    norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True,
    max_df=1.0, min_df=1)
# create an iterator object to conserve memory
def make_corpus(paths):
    for p in paths:
        with open(p, 'r') as f:
            txt = f.read()
        yield txt
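# make_corpus yields one file's text at a time so the full corpus is never held
# in memory; a generator can only be consumed once, which is why a fresh one is
# created for fit() below and another one for transform().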
# train
train_txt_paths = list(txt_paths) # duplicate
shuffle(train_txt_paths) # shuffle
train_txt_paths = train_txt_paths[:min(len(train_txt_paths), max_train)] # crop
print("training on %d documents..." % (len(train_txt_paths), ))
train_corpus = make_corpus(train_txt_paths)
v.fit(train_corpus)
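# the vocabulary and idf weights are learned from the random subsample only;
# the transform below then applies them to every document.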
# transform
print("transforming %d documents..." % (len(txt_paths), ))
corpus = make_corpus(txt_paths)
X = v.transform(corpus)
print(v.vocabulary_)
print(X.shape)
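# rows of X follow the order of txt_paths/pids, i.e. row i is the tf-idf vector
# of the paper with id pids[i].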
# write full matrix out
out = {}
out['X'] = X # this one is heavy!
print("writing", Config.tfidf_path)
safe_pickle_dump(out, Config.tfidf_path)
# writing lighter metadata information into a separate (smaller) file
out = {}
out['vocab'] = v.vocabulary_
out['idf'] = v.idf_  # inverse document frequency weights learned by the vectorizer
out['pids'] = pids  # paper ids (the db keys), in the same order as the rows of X
out['ptoi'] = {x: i for i, x in enumerate(pids)}  # pid -> row index in X
print("writing", Config.meta_path)
safe_pickle_dump(out, Config.meta_path)
# Find newest year of each conference
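# conf_id is assumed to look like '<conference name><4-digit year>' (e.g.
# 'icml2020' would split into name 'icml' and year '2020'), so conf_id[:-4] is
# the name and conf_id[-4:] is the year; the "newest" year for a conference is
# taken from its most recently published paper.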
composed_conference_ids = {(p['conf_id'], dateutil.parser.parse(p['published'])) for p in db.values()}
composed_conference_ids = sorted(composed_conference_ids, key=lambda x: x[1], reverse=True)
newest_conf_years = {}
for conf_id, pub_date in composed_conference_ids:
    conf_name = conf_id[:-4]
    if conf_name not in newest_conf_years:
        newest_conf_years[conf_name] = pub_date.strftime('%Y')
# Counts how many papers in the latest conferences are already in the top picks
def count_conference_papers(top_pids, newest_conf_years, db):
    counters = {conf_name: 0 for conf_name in newest_conf_years}
    for pid in top_pids:
        conf_id = db[pid]['conf_id']
        conf_name = conf_id[:-4]
        conf_year = conf_id[-4:]
        if db[pid]['conf_sub_id'].lower() == 'main' and conf_year == newest_conf_years[conf_name]:
            counters[conf_name] += 1
    return counters
# Find more top pids until all latest conferences have at least top_k_by_conf papers in the list
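# walks the similarity-ranked list beyond the first top_k entries and keeps
# adding main-track papers from each conference's newest year until that
# conference's counter reaches top_k_by_conf (or the ranked list runs out).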
def get_top_pids_by_conference(top_k, top_k_by_conf, conf_counters, newest_conf_years, sort_idx, pid_confs, db):
    sorted_pid_confs = [pid_confs[i] for i in sort_idx]
    filtered_pid_confs = [
        pc for pc in sorted_pid_confs[top_k:]
        if pc.subid == 'main' and
        pc.year == newest_conf_years[pc.name]]
    conf_top_pids = []
    for pc in filtered_pid_confs:
        if conf_counters[pc.name] < top_k_by_conf:
            conf_top_pids.append(pc.pid)
            conf_counters[pc.name] += 1
    return conf_top_pids
print("precomputing nearest neighbor queries in batches...")
X = X.todense() # originally it's a sparse matrix
sim_dict = {}
batch_size = 200
top_k = 500
top_k_by_conf = 50
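# because the rows of X are L2-normalized, X.dot(xquery.T) yields cosine
# similarities; negating and argsorting along axis 0 ranks papers from most to
# least similar for each query in the batch. Index 0 of each column is normally
# the query paper itself (self-similarity), hence the IX[1:top_k, j] slice below.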
for i in range(0, len(pids), batch_size):
    i1 = min(len(pids), i + batch_size)
    xquery = X[i:i1]  # BxD
    ds = -np.asarray(np.dot(X, xquery.T))  # NxD * DxB => NxB
    IX = np.argsort(ds, axis=0)  # NxB
    for j in range(i1 - i):
        top_pids = [pids[q] for q in list(IX[1:top_k, j])]
        conf_counters = count_conference_papers(top_pids, newest_conf_years, db)
        conf_top_pids = get_top_pids_by_conference(top_k, top_k_by_conf, conf_counters, newest_conf_years, IX[:, j], pid_confs, db)
        top_pids += conf_top_pids
        sim_dict[pids[i + j]] = top_pids
    print('%d/%d...' % (i, len(pids)))
print("writing", Config.sim_path)
safe_pickle_dump(sim_dict, Config.sim_path)
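# Example of how a downstream consumer might read the dumps back (a sketch,
# assuming safe_pickle_dump writes ordinary pickle files; 'some_pid' is a
# hypothetical paper id present in the database):
#
#   with open(Config.sim_path, 'rb') as f:
#       sim_dict = pickle.load(f)
#   print(sim_dict[some_pid][:10])  # ten most similar papers, best first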