-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgroup_quotes.py
95 lines (86 loc) · 3.35 KB
/
group_quotes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from collections import defaultdict
'''
Utility functions for grouping quotes into families.
Grouping quotes works as follows:
1. For each transcript, the quotes aligned to that transcript
are sorted in order of their position in the transcript.
2. For each quote, we count how many of its word positions overlap with
the part of the transcript already covered by the preceding quotes.
3. If the overlap is at least some minimum threshold, we put that quote in the
same group as the previous quote; otherwise we start a new quote group.
'''
def get_first_pos(seq):
    '''
    returns the first non-negative value in a sequence.

    entries are read by index, left to right, starting at index 0; a value
    of 0 counts as a hit. if seq is empty, or every entry is negative, the
    scan runs past the end and the IndexError raised by seq propagates.
    '''
    cursor = 0
    while True:
        candidate = seq[cursor]
        # stop at the first entry that is not below zero.
        if not candidate < 0:
            return candidate
        cursor += 1
def group_quotes(quotelist, family_start_id, min_overlap = 5):
    '''
    groups a list of quotes aligned to the same transcript.

    the quotes are expected to be ordered by their position in the
    transcript. the first quote always opens the group family_start_id;
    every later quote joins the current group if at least min_overlap of its
    positions fall inside the stretch of transcript already covered by the
    preceding quotes, and otherwise opens a new group.

    arguments:
    quotelist (list of alignments): list of quotes, represented as alignments.
        only the last element of each quote (q[-1]) is read; it is treated
        as a sequence of numeric positions, where negative values are
        ignored when counting overlap.
    family_start_id (int): the group id assigned to the first quote.
    min_overlap (int, default=5): minimum overlap between two quotes such that
        they are put in the same group.
    returns:
    family_assigns (list of int): list of quote group ids, in order of quotes listed
        in quotelist argument
    curr_id (int): last group id assigned to a quote. for an empty quotelist
        no id is assigned and family_start_id is returned unchanged.
    '''
    quotes = iter(quotelist)
    first = next(quotes, None)
    if first is None:
        return [], family_start_id

    curr_id = family_start_id
    family_assigns = [curr_id]
    # we (somewhat arbitrarily) suppose the quote is represented by its last segment.
    # extent is the furthest transcript position reached by any quote seen so far.
    extent = max(first[-1])
    for q in quotes:
        positions = q[-1]
        # count the positions that land inside the already-covered region.
        overlen = sum(1 for x in positions if 0 <= x <= extent)
        if overlen < min_overlap:
            curr_id += 1
        family_assigns.append(curr_id)
        extent = max(extent, max(positions))
    return family_assigns, curr_id
def get_transcript_to_quotelist(alignment_list):
    '''
    returns a dict of transcript name to list of quotes aligned to that transcript

    arguments:
    alignment_list: iterable of alignments, where element 0 of each alignment
        is the quote and element 1 is the transcript it is aligned to.
        quotes must be hashable, since duplicates are collapsed through a set.
    returns:
    transcript_to_quotelist (defaultdict): transcript -> list of the distinct
        quotes aligned to it, sorted by the first non-negative position in
        the quote's last segment (see get_first_pos). looking up a transcript
        that has no quotes yields an empty set.
    '''
    # collect the distinct quotes of each transcript first.
    quotes_by_transcript = defaultdict(set)
    for q in alignment_list:
        quotes_by_transcript[q[1]].add(q[0])

    # dict.items() works on both python 2 and 3 (iteritems is python 2 only).
    # the sorted lists go into a separate mapping so that nothing is
    # reassigned in the dict being iterated.
    transcript_to_quotelist = defaultdict(set)
    for transcript, quotes in quotes_by_transcript.items():
        transcript_to_quotelist[transcript] = sorted(
            quotes, key=lambda x: get_first_pos(x[-1]))
    return transcript_to_quotelist
def group_all(transcript_to_quotelist, transcript_order, min_overlap = 5, start_id = 0):
    '''
    groups quotes into families of similar quotes.

    each transcript is grouped independently with group_quotes, and the
    group ids of one transcript continue after the last id used by the
    previous one, so ids are never shared between transcripts.

    arguments:
    transcript_to_quotelist (dict): output of get_transcript_to_quotelist
    transcript_order: list of transcript names in order of increasing timestamp.
        transcripts with no quotes (or missing from transcript_to_quotelist)
        are skipped.
    min_overlap (int, default=5): minimum overlap between two quotes such that
        they are put in the same group; forwarded to group_quotes.
    start_id (int): the value of the first group id to be assigned.
    returns:
    alignment_to_family_id: dict of quote alignments (as tuple(quote, transcriptname))
        to group id
    family_id_to_alignments: dict of group id to set of quotes, represented as alignments,
        in that group.
    '''
    alignment_to_family_id = {}
    family_id_to_alignments = defaultdict(set)
    f_id = start_id
    for transcript in transcript_order:
        # .get avoids inserting empty entries when the mapping is a defaultdict.
        quotelist = transcript_to_quotelist.get(transcript)
        if not quotelist:
            continue
        # pass the caller's threshold through instead of silently using the default.
        f_assigns, curr_id = group_quotes(quotelist, f_id, min_overlap)
        for quote, assign in zip(quotelist, f_assigns):
            alignment = (quote, transcript)
            alignment_to_family_id[alignment] = assign
            family_id_to_alignments[assign].add(alignment)
        # the next transcript starts on a fresh id.
        f_id = curr_id + 1
    return alignment_to_family_id, family_id_to_alignments