-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathquote_searcher.py
112 lines (95 loc) · 3.35 KB
/
quote_searcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import gzip
import os
import datetime as dt
from match_quotes import QuoteMatcher
import cPickle
'''
searches for quotes matching transcripts in a spinn3r data directory, returning mentions.
'''
TRANSCRIPT_TIMEFORMAT = "%Y-%m-%d %H:%M"
NEWS_TIMEFORMAT = "%Y-%m-%d %H:%M:%S"
HYPHEN_TYPES = ["\xe2\x80\x94", " - ", " -- "]
class QuoteSearcher:
MENTION_DUMPNAME = "mentions"
def __init__(self, transcript_dir, dump_dir, cache_dir = 'cache', tolerance = -0.4, bag_tolerance = 0.7):
'''
Arguments:
transcript_dir (str): directory containing transcripts. Transcripts are of form:
Transcript title
Transcript timestamp (%Y-%m-%d %H:%M)
Transcript text
dump_dir (str): directory to dump matches found
cache_dir (str): directory to dump cache of quote matches
tolerance (float, optional, default=-.4): minimum similarity score for match
bag_tolerance (float, optional, default=.7): minimum proportion of words
in quote that must be present in a transcript for alignment process to continue
'''
self.qm = QuoteMatcher(transcript_dir, cache_dir, tolerance, bag_tolerance)
self.dump_dir = dump_dir
self.mentions = []
self.read_files = set()
self.errors = 0
def read_file(self, filename, reread=False, cache_matches=True):
'''
Reads one spinn3r file, matching all quotes from that file to transcripts.
Arguments:
filename (str): spinn3r filename
reread (bool, optional, default=False): if True, will reread a file we've already read.
(this is mostly for development: when program crashes on a new file, we don't really
want to reread old ones.)
cache_matches (bool, optional, default=True): if True, will save to disk the cache of quote matches.
'''
if reread or filename not in self.read_files:
print "Reading "+filename
with gzip.open(filename, 'rb') as f:
linecount = 0
for line in f:
article = self.read_article(line)
self.match_quotes(article)
linecount += 1
#if linecount % 200 == 0:
# print "Read "+str(linecount) + " lines"
self.read_files.add(filename)
print "Finished reading. Caching matches..."
if cache_matches:
self.qm.dump_all()
def read_article(self, line):
'''
Reads single article, returning article dict.
'''
article_dict = eval(line) # hmmm
strdate = article_dict['date']
article_dict['date'] = dt.datetime.strptime(strdate, NEWS_TIMEFORMAT)
quotes = []
raw_quote_list = article_dict['quotes']
for elem in raw_quote_list:
quotes.append(elem['quote'])
article_dict['quotes'] = quotes
return article_dict
def dump_mentions(self, dump_filename=None):
'''
Writes all matches found as pickle to dump_filename.
'''
if dump_filename is None:
dump_filename = self.MENTION_DUMPNAME
mentions_dump = os.path.join(self.dump_dir, dump_filename)
print "Dumping mentions into "+mentions_dump
if not os.path.exists(self.dump_dir):
os.mkdir(self.dump_dir)
with open(mentions_dump, 'wb') as f:
cPickle.dump(self.mentions, f)
print "Finished dumping mentions"
def match_quotes(self, article):
'''
matches all quotes in an article.
'''
timestamp = article['date']
quotes = article['quotes']
for quote in quotes:
try:
match_result = self.qm.match_quote(quote, timestamp)
if match_result[0]:
self.mentions.append((quote, match_result[1], article))
except:
print "Ack"
self.errors +=1