-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpostfilter_matches.py
79 lines (66 loc) · 2.21 KB
/
postfilter_matches.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from __future__ import division
import cPickle
import tldextract
'''
gets rid of not real matches and wn.com articles. it works ok.
'''
RAW_MENTION_OUTPUT = 'output/raw_mentions' #unfiltered mentions, a pickle
POSTFILTER_OUTPUT = 'output/postfiltered_mentions' #where to write filtered mentions, also a pickle
def short_seg_filter(segs, thresh = .5):
'''
keeps quote (i.e. returns True) if for all segments, at least proportion=threshold
of words are actually in the transcript it was matched to.
'''
keep = True
for x in segs:
if sum([y==-1 for y in x])/len(x) >= thresh:
keep = False
return keep
def short_quote_filter(segs, score, thresh_len = 6):
'''
filters out short quotes which do not match perfectly.
'''
if len(segs) > 1:
return True
else:
seg = segs[0]
if len(seg)>thresh_len:
return True
else:
if sum([y >= 0 for y in seg]) == len(seg):
return True
else:
return score >= 0
def score_filter(segs, score, thresh = -.37):
'''
filters out low-similarity alignments.
'''
if len(segs) > 1:
return True
else:
return score > thresh
def ratio_filter(segs, thresh = .3, small_thresh = 7):
'''
removes quote if a large proportion of words aren't actually
present in transcript.
'''
if len(segs) > 1:
return True
else:
ratio = sum([y==-1 for y in segs[0]])/len(segs[0])
if ratio > thresh:
return False
else:
if len(segs[0]) > small_thresh:
return True
else:
return ratio == 0
with open(RAW_MENTION_OUTPUT) as f:
mentions = cPickle.load(f)
filtered_by_segments = [x for x in mentions if short_seg_filter(x[1][0])]
filtered_by_perfshorts = [x for x in filtered_by_segments if short_quote_filter(x[1][0],x[1][2])]
filtered_for_score = [x for x in filtered_by_perfshorts if score_filter(x[1][0],x[1][2])]
filtered_for_ratio = [x for x in filtered_for_score if ratio_filter(x[1][0])]
filtered_for_wn = [x for x in filtered_for_ratio if tldextract.extract(x[2]['url']).domain != 'wn']
with open(POSTFILTER_OUTPUT,'w') as f:
cPickle.dump(filtered_for_wn, f)