-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathumiaq_split.py
266 lines (227 loc) · 10.3 KB
/
umiaq_split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
import re
from typing import List, Dict, Tuple
NAMED_GROUP_PATTERN = r"\(\?P<\w+>.*?\)"
NAMED_GROUP_COMPILED = re.compile(r"\(\?P<(\w+)>")
BACKREF_PATTERN = r"\\\d+"
BACKREF_COMPILED = re.compile(r"\\(\d+)")
CAPITAL_GREEK_LETTERS = [None, "Γ", "Δ", "Λ", "Ξ", "Π", "Σ", "Φ", "Ψ", "Ω", "Θ"]
def extract_backrefs(regex):
"""
Extract backreferences from a regex pattern
return a dictionary mapping backreference indices to their referenced group names
"""
named_groups = {i+1: name for i, name in enumerate(NAMED_GROUP_COMPILED.findall(regex))}
backrefs = {}
for m in BACKREF_COMPILED.finditer(regex):
backref_index = int(m.group(1))
if backref_index in named_groups:
backrefs[backref_index] = named_groups[backref_index]
return backrefs
def split_named_regex(pattern: str):
"""
Split a regex pattern into an array where named groups and other parts
are separate elements.
:param pattern: The regex pattern to split.
:return: A list of strings with named groups and other components.
"""
# Regex to match named groups like (?P<name>...)
pattern_for_names = f"({NAMED_GROUP_PATTERN})|({BACKREF_PATTERN})"
pattern_for_split = r"(\(\?P<\w+>.*?\)|\\\d+)"
# Find all named groups
named_groups = re.findall(pattern_for_names, pattern)
named_groups = [item for pair in named_groups for item in pair if item]
# Split the pattern at the named groups, keeping the separators
result = re.split(pattern_for_split, pattern)
# Filter out components that explicitly match, but keep empty strings
parts = [part for part in result if part != None and not re.fullmatch(pattern_for_split, part)]
# Interleave the split parts and the named groups
result = []
for part, group in zip(parts, named_groups + [""]): # Add an empty string to handle the last split
if part:
result.append(part)
if group:
result.append(group)
return result
def parse_pattern(pattern):
"""
Parse a regex into "group" and "fixed" components
Return an array of "group" or "fixed" components
and optionally a "backref" component
"""
# Pull out backrefs, if any
backrefs = extract_backrefs(pattern)
# split the regex
split_regex = split_named_regex(pattern)
arr = []
named = ''
for patt in split_regex:
if re.fullmatch(NAMED_GROUP_PATTERN, patt):
named += patt[4]
elif BACKREF_COMPILED.fullmatch(patt):
letter = CAPITAL_GREEK_LETTERS[int(BACKREF_COMPILED.fullmatch(patt).group(1))]
named += letter
else:
if named:
arr.append(('group', named))
named = ''
arr.append(('fixed', patt.upper()))
if named:
arr.append(('group', named))
backrefs_greek = {CAPITAL_GREEK_LETTERS[k]: v for k, v in backrefs.items()}
return arr, backrefs_greek
def find_fixed_positions(word: str, fixed_components: List[str]) -> List[List[Tuple[int, int]]]:
"""
Find all start and end positions of fixed regex components in the word, including overlapping matches.
:param word: The input word.
:param fixed_components: List of fixed regex patterns.
:return: A list of lists, where each inner list contains tuples of (start, end)
positions of matches for a fixed regex component, including overlapping matches.
"""
positions = []
for component in fixed_components:
regex = re.compile(component)
matches = []
# Manually check for overlapping matches
for i in range(len(word)):
match = regex.match(word[i:])
if match:
matches.append((i, i + len(match.group())))
positions.append(matches)
return positions
def find_regex_partitions_fixed(word: str, components: List[Tuple[str, str]], lengths: dict = {}, backrefs: dict = {}) -> List[Dict[str, str]]:
"""
Partition a word into substrings matching a parsed regex pattern with mixed group and fixed components,
ensuring fixed components fully match before proceeding to the next group.
:param word: The input word to partition.
:param components: Parsed components of the regex pattern.
:return: A list of dictionaries with matched named groups.
"""
# Separate fixed and group components
fixed_components = [(i, c[1]) for i, c in enumerate(components) if c[0] == 'fixed']
group_components = [(i, c[1]) for i, c in enumerate(components) if c[0] == 'group']
# get the "group" letters
group_letters = frozenset([_[1] for _ in group_components])
# Precompute positions of fixed components with overlapping matches
fixed_positions = find_fixed_positions(word, [fc[1] for fc in fixed_components])
# Ensure the positions are sequential
valid_combinations = []
def generate_combinations(index=0, current=[]):
if index == len(fixed_positions):
valid_combinations.append(current[:])
return
last_position = current[-1][1] if current else -1
for start, end in fixed_positions[index]:
if start > last_position: # Ensure sequential order
current.append((start, end))
generate_combinations(index + 1, current)
current.pop()
generate_combinations()
# Build partitions from valid combinations
results = []
for combination in valid_combinations:
partition = {}
current_start = 0
group_index = 0
for fixed_index, (start, end) in enumerate(combination):
# Assign named groups before the current fixed component
while group_index < len(group_components) and group_components[group_index][0] < fixed_components[fixed_index][0]:
group_name = group_components[group_index][1]
if current_start < start: # Ensure non-empty group substrings
partition[group_name] = word[current_start:start]
current_start = start
group_index += 1
else:
break
# Move past the fixed component
fixed_name = f"fixed_{fixed_index}"
partition[fixed_name] = word[start:end]
current_start = end
# Assign any remaining named groups
if group_index < len(group_components):
for _, group_name in group_components[group_index:]:
if current_start < len(word): # Ensure non-empty trailing groups
partition[group_name] = word[current_start:]
current_start = len(word)
# add the word itself
#partition["word"] = word
results.append(partition)
# Filter out invalid results
def is_valid_partition(partition):
# Concatenate all values and compare to the original word
reconstructed = ''.join(partition.values())
if reconstructed != word:
return False
# also check that all group letters are keys
return group_letters.issubset(set(partition.keys()))
results = [partition | {"word": word} for partition in results if is_valid_partition(partition)]
# If we have any "multiple" named groups, split them
final_results = results
multiple_groups = [_[1] for _ in group_components if len(_[1]) > 1]
if multiple_groups:
final_results = []
for group_name in multiple_groups:
for r in results:
num_partitions = len(group_name)
if len(r[group_name]) >= num_partitions:
this_str = r.pop(group_name, None)
for p in generate_partitions(this_str, num_partitions):
d = dict((group_name[i], p[i]) for i in range(num_partitions))
d.update(r)
final_results.append(d)
# Check that the lengths line up
final_results2 = final_results
if lengths:
final_results2 = []
for k, v in lengths.items():
for part in final_results:
if len(part[k]) == v:
final_results2.append(part)
# We need to do a check for backreferences
if not backrefs:
return final_results2
actual_final_results = []
for k, v in backrefs.items():
for part in final_results2:
if part.get(k) == part.get(v):
actual_final_results.append(part)
return actual_final_results
def split_word_on_pattern(word, pattern):
regex = pattern.regex
# Assume if a variable has a length, it is fixed
# TODO: do we need to change this down the road?
lengths = dict((k, v[0]) for k, v in pattern.lengths.items() if re.fullmatch(r'[A-Z]', k))
# if there are no named components, return something simple
if not re.search(NAMED_GROUP_PATTERN, regex):
return [{"word": word}]
parsed_components, backrefs = parse_pattern(regex)
result = find_regex_partitions_fixed(word, parsed_components, lengths=lengths, backrefs=backrefs)
return result
def generate_partitions(string: str, num_partitions: int) -> List[List[str]]:
"""
Generate all non-zero-length partitions of a string into a given number of partitions.
:param string: The input string to partition.
:param num_partitions: The number of partitions to generate.
:return: A list of partitions, each represented as a list of substrings.
"""
def helper(start: int, partitions: List[str]):
# If we've filled the required number of partitions
if len(partitions) == num_partitions - 1:
# The last partition takes the remaining substring
remaining = string[start:]
if remaining: # Ensure it's non-zero-length
results.append(partitions + [remaining])
return
# Iterate over possible splits for the current partition
for end in range(start + 1, len(string) - (num_partitions - len(partitions) - 1) + 1):
helper(end, partitions + [string[start:end]])
results = []
helper(0, [])
return results
#%%
if __name__ == '__main__':
# Example usage
word = "queueing".lower()
pattern = r"(?P<A>.+)[aeiou]{3}(?P<C>.+)"
result = split_word_on_pattern(word, pattern)
for match in result:
print(match)