-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmatch_quotes.py
160 lines (130 loc) · 5.63 KB
/
match_quotes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
from __future__ import division
import datetime as dt
import os, string, collections, cPickle, bisect
import match_utils as mu
'''Aligns quotes to a set of texts.'''
class QuoteMatcher(object):
    '''
    Aligns quotes to a set of texts.

    Transcripts are loaded once at construction time through
    mu.fetch_transcripts. The outcome of matching a quote against a transcript
    is memoised in self.phrasematch_cache, a dict keyed by
    (quote, transcript name) whose values are either (True, align) for an
    accepted alignment or (False, None) for a rejected pair. The cache can be
    persisted to / restored from a pickle file in cache_dir.
    '''

    PHRASEMATCH_CACHENAME = 'phrase_matches'
    MIN_LEN = 6  # minimum word count of quote
    MAX_INTERVAL = dt.timedelta(days=14)  # max time interval between transcript and quote
    GAP_PEN = -1  # gap penalty for needleman-wunsch
    SUB_PEN = -1  # substitution penalty for needleman-wunsch
    ACCEPT_THRESHOLD = -0.1  # not read anywhere inside this class

    def __init__(self, transcript_dir, cache_dir, sim_tolerance=-0.4, bag_tolerance=0.7):
        '''
        Arguments:
            transcript_dir (str): directory containing speech transcripts.
                Transcripts should be of format:
                    Transcript title
                    Transcript timestamp
                    Transcript text
            cache_dir (str): directory to write cache output
            sim_tolerance (float, default=-.4): minimum similarity of valid
                quote-transcript match
            bag_tolerance (float, default=.7): minimum proportion of words in
                quote that must be present in transcript for string alignment
                process to occur.
        '''
        self.sim_tolerance = sim_tolerance
        self.bag_tolerance = bag_tolerance
        self.cache_dir = cache_dir
        transcript_order, self.transcript_text = mu.fetch_transcripts(transcript_dir)
        # Each entry of transcript_order is indexed as (time, name).
        # NOTE(review): match_quote bisects transcript_times, so this list is
        # assumed to come back sorted by time -- confirm in match_utils.
        self.transcript_times = [entry[0] for entry in transcript_order]
        self.transcript_names = [entry[1] for entry in transcript_order]
        self.phrasematch_cache = self.load_cached_dict(self.PHRASEMATCH_CACHENAME)

    def load_cached_dict(self, cache_filename):
        '''
        Loads a phrasematch cache from cache_filename, a pickle file.

        Arguments:
            cache_filename (str): file name, relative to self.cache_dir.
        Returns:
            the unpickled object, or an empty dict if the file does not exist.
        '''
        cache_path = os.path.join(self.cache_dir, cache_filename)
        if not os.path.isfile(cache_path):
            return {}
        print("Loading " + cache_filename)
        # SECURITY: unpickling can execute arbitrary code; only point
        # cache_dir at files this program wrote itself.
        with open(cache_path, 'rb') as f:
            return cPickle.load(f)

    def dump_cached(self, cached, cache_filename):
        '''
        Dumps phrasematch cache to a pickle file.

        Arguments:
            cached: object to pickle.
            cache_filename (str): file name, relative to self.cache_dir.
        Raises:
            OSError: if self.cache_dir cannot be created and is not already a
                directory.
        '''
        print("Dumping " + cache_filename)
        # makedirs (rather than exists + mkdir) also creates missing parent
        # directories; re-checking isdir tolerates the directory already
        # existing or being created concurrently.
        try:
            os.makedirs(self.cache_dir)
        except OSError:
            if not os.path.isdir(self.cache_dir):
                raise
        with open(os.path.join(self.cache_dir, cache_filename), 'wb') as f:
            cPickle.dump(cached, f)
        print("Finished dumping " + cache_filename)

    def dump_all(self):
        '''
        Writes the in-memory phrasematch cache to its default cache file.
        '''
        self.dump_cached(self.phrasematch_cache, self.PHRASEMATCH_CACHENAME)

    def match_quote(self, quote, quote_timestamp):
        '''
        Aligns a quote to a set of transcripts.

        Arguments:
            quote (str): quote to match
            quote_timestamp (datetime): timestamp of quote
        Returns:
            (match, result) where
            match (bool): True if match found (i.e. quote similarity score
                above threshold), False else.
            result: None if match is False, otherwise a tuple containing:
                alignment: best alignment for quote
                transcript name (str): transcript that quote aligned to
                score (float): alignment score

        The precise process works as follows:
            1. check if quote is above minimum word length
            2. find transcripts within maximum time interval that occur
               before quote
            3. check if there is already an alignment for a quote (or an entry
               saying the quote couldn't be aligned) in the cache
            4. check if a sufficiently high proportion of words in the quote
               are also present in the transcript
            5. align the quote to the transcript, and save the alignment to
               the cache.
        '''
        quote_array = mu.convert_to_match_array(quote)
        if len(quote_array) < self.MIN_LEN:
            return (False, None)

        # Candidate window: transcripts timestamped strictly before the quote
        # and no earlier than MAX_INTERVAL before it.
        latest = bisect.bisect_left(self.transcript_times, quote_timestamp) - 1
        earliest = bisect.bisect_left(self.transcript_times,
                                      quote_timestamp - self.MAX_INTERVAL)
        if latest < 0 or earliest >= len(self.transcript_times):
            return (False, None)

        best_align = None
        best_transcript = None
        best_score = None
        # Walk from the most recent candidate backwards; on equal scores the
        # more recent transcript is kept because the comparison is strict.
        for index in range(latest, earliest - 1, -1):
            transcript_name = self.transcript_names[index]
            align = self._align_to_transcript(quote, quote_array, transcript_name)
            if align is None:
                continue
            # Explicit None check: comparing a number with None only works on
            # Python 2 and raises TypeError on Python 3.
            if best_score is None or align[1] > best_score:
                best_align = align[0]
                best_score = align[1]
                best_transcript = transcript_name

        # Cached alignments may have been accepted under a different
        # sim_tolerance, so the winner is re-checked against the current one.
        if best_score is not None and best_score >= self.sim_tolerance:
            return (True, (best_align, best_transcript, best_score))
        return (False, None)

    def _align_to_transcript(self, quote, quote_array, transcript_name):
        '''
        Returns the accepted alignment of quote against one transcript.

        Consults self.phrasematch_cache first; on a miss, computes the result
        and stores it in the cache (both acceptances and rejections).

        Arguments:
            quote (str): quote to match
            quote_array: result of mu.convert_to_match_array(quote); must be
                non-empty
            transcript_name (str): key into self.transcript_text
        Returns:
            the value produced by mu.align (indexed as [0] alignment,
            [1] score), or None if the pair is rejected.
        '''
        cache_key = (quote, transcript_name)
        cached = self.phrasematch_cache.get(cache_key)
        if cached is not None:
            return cached[1] if cached[0] else None

        transcript = self.transcript_text[transcript_name]
        # Cheap pre-filter: skip the expensive alignment unless enough of the
        # quote's words occur in the transcript's 'bag' at all.
        in_set_count = sum(1 for word in quote_array if word in transcript['bag'])
        # True division: relies on the file-level
        # `from __future__ import division` when run under Python 2.
        if in_set_count / len(quote_array) < self.bag_tolerance:
            self.phrasematch_cache[cache_key] = (False, None)
            return None

        align = mu.align(quote, transcript['raw'], transcript['match'],
                         self.SUB_PEN, self.GAP_PEN)
        if align[1] >= self.sim_tolerance:
            self.phrasematch_cache[cache_key] = (True, align)
            return align
        self.phrasematch_cache[cache_key] = (False, None)
        return None