-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfind_tokens.jl
205 lines (181 loc) · 7.19 KB
/
find_tokens.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
"""
find_tokens(s, templates)
Go through a text left to right, one (valid) char at the time and keep track
of sequences of chars that match specific tokens. The list of tokens found is
returned.
## Arguments
* `s`: the initial text
* `templates`: dictionary of possible tokens
## Errors
This should not throw any error, everything should be explicitly handled by
a code path.
"""
function find_tokens(
s::SS,
templates::Dict{Char, Vector{Pair{TokenFinder, Symbol}}}
)::Vector{Token}
tokens = Token[]
isempty(s) && return tokens
# Put a "start of string" token first, note that it may overlap with another
# token which would be right at the start too.
push!(
tokens,
Token(:SOS, subs(parent_string(s), from(s)))
)
head_idx = firstindex(s)
end_idx = lastindex(s)
@inbounds while head_idx <= end_idx
head_char = s[head_idx]
if haskey(templates, head_char)
# Look at each possible finder sequentially
for (tf, case) in templates[head_char]
# ------------------------------------
# exact match of a given fixed pattern
# --> we form a candidate substring with a fixed number of characters
# and try to see if it matches a fixed rule. Possibly the substring
# contains an extra character for rules where we must match only when
# the next character is or isn't something.
if (tf.steps >= 0)
tail_idx = nextind(s, head_idx, tf.steps)
at_eos = false
if tail_idx == nextind(s, end_idx)
tail_idx = end_idx
at_eos = true
end
(tail_idx > end_idx) && continue
# if there is space, consider the substring and verify whether it matches
candidate = subs(s, head_idx, tail_idx)
matches, offset = fixed_lookahead(tf, candidate, at_eos)
# if it matches, form the token and break the for loop: no need to check
# other cases.
if matches
head_idx = prevind(s, tail_idx, offset)
push!(
tokens,
Token(case, chop(candidate, tail=offset))
)
break
end
# -----------------------------------------
# rule-based match: greedy catch until fail
# --> we gradually form a candidate substring of increasing length until
# the next character doesn't meet the condition.
else
nchars = 1
tail_idx = head_idx
probe_idx = nextind(s, head_idx)
probe_idx > end_idx && continue
probe_char::Char = s[probe_idx]
# while the condition holds, consume get next char
while greedy_lookahead(tf, nchars, probe_char)
tail_idx = probe_idx
probe_idx = nextind(s, probe_idx)
(probe_idx > end_idx) && break
probe_char = s[probe_idx]
nchars += 1
end
# if we took in at least a char, validate then form the token
if tail_idx > head_idx
candidate = subs(s, head_idx, tail_idx)
# check if the backward validator is happy otherwise skip
check(tf, candidate) || continue
# if it's happy move head and push the token
head_idx = tail_idx
push!(
tokens,
Token(case, candidate)
)
break
end
end
end
end
head_idx = nextind(s, head_idx)
end
# finally push the end token on the stack, note that it can overlap another
# token that would be at the end of the string, same as :SOS token.
push!(
tokens,
Token(:EOS, subs(s, end_idx))
)
# discard header tokens that are not at the start of a line or
# only preceded by whitespaces
process_header_tokens!(tokens)
# validate or drop emphasis tokens
process_emphasis_tokens!(tokens)
# discard autolink_close tokens which are preceded by a space
process_autolink_close_tokens!(tokens)
return tokens
end
# Convenience method: wrap a plain `String` into a substring over the whole
# string so the main `SS` method above can be used (`SS` is presumably the
# package's SubString alias — confirm against its definition).
find_tokens(s::String, templates) = find_tokens(subs(s), templates)
"""
process_header_tokens!(tokens)
Discard header tokens that are not at the start of a line or only preceded by
whitespaces.
"""
function process_header_tokens!(tokens::Vector{Token})
remove = Int[]
@inbounds for (i, t) in enumerate(tokens)
if t.name in MD_HEADERS
ss = until_previous_line_return(t)
isempty(strip(ss)) || push!(remove, i)
end
end
deleteat!(tokens, remove)
end
"""
process_emphasis_tokens!(tokens)
Process emphasis token candidates and either take them or discard them if
they don't look correct.
* `sTs` with token `T` is `s` is a space
* `xTs` with token `T` is a valid CLOSE if `x` is a character and `s` a space
* `sTx` with token `T` is a valid OPEN if `x` is a character and `s` a space
* `xTy` with token `T` is a valid MIXED if `x`, `y` are characters
"""
function process_emphasis_tokens!(tokens::Vector{Token})
isempty(tokens) && return
remove = Int[]
ps = parent_string(first(tokens))
N = lastindex(ps)
@inbounds for (i, t) in enumerate(tokens)
if t.name in (:EM, :STRONG, :EM_STRONG)
prev_char = previous_chars(t)
next_char = next_chars(t)
# if the token is surrounded by spaces, discard it
bad = !isempty(prev_char) && first(prev_char) in SPACE_CHAR &&
!isempty(next_char) && first(next_char) in SPACE_CHAR
# sTs
if bad
push!(remove, i)
# Tx or sTx
elseif isempty(prev_char) || first(prev_char) in SPACE_CHAR
n = Symbol(string(t.name) * "_OPEN")
tokens[i] = Token(n, t.ss)
# xT or xTs
elseif isempty(next_char) || first(next_char) in SPACE_CHAR
n = Symbol(string(t.name) * "_CLOSE")
tokens[i] = Token(n, t.ss)
else # xTy
n = Symbol(string(t.name) * "_MX")
tokens[i] = Token(n, t.ss)
end
end
end
deleteat!(tokens, remove)
return
end
"""
process_autolink_close_tokens!(tokens)
Discard :AUTOLINK_CLOSE that are preceded by a space.
"""
function process_autolink_close_tokens!(tokens::Vector{Token})
isempty(tokens) && return
remove = Int[]
@inbounds for (i, t) in enumerate(tokens)
t.name == :AUTOLINK_CLOSE || continue
c = previous_chars(t)
(isempty(c) || first(c) ∈ SPACE_CHAR) && push!(remove, i)
end
deleteat!(tokens, remove)
return
end