forked from JuliaLang/julia
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathregex.jl
134 lines (115 loc) · 4.21 KB
/
regex.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
## object-oriented Regex interface ##
include("pcre.jl")
# A compiled regular expression: the pattern text, the option bits it was
# built with, the compiled PCRE program and (optionally) PCRE study data.
type Regex
    pattern::ByteString
    options::Int32
    regex::Array{Uint8}
    extra::Ptr{Void}

    # Compile `pat` with option bits `opts`; when `study` is true the compiled
    # program is also passed through PCRE.study, otherwise `extra` is C_NULL.
    # Raises an error if `opts` has bits outside PCRE.OPTIONS_MASK.
    function Regex(pat::String, opts::Integer, study::Bool)
        pat = cstring(pat)
        opts = int32(opts)
        if (opts & ~PCRE.OPTIONS_MASK) != 0
            error("invalid regex option(s)")
        end
        # only the compile-time subset of the options is handed to the compiler
        compiled = PCRE.compile(pat, opts & PCRE.COMPILE_MASK)
        studied = study ? PCRE.study(compiled) : C_NULL
        new(pat, opts, compiled, studied)
    end
end
# Convenience constructors: omitted options default to 0 and an omitted
# study flag defaults to false.
function Regex(p::String, s::Bool)
    Regex(p, 0, s)
end
function Regex(p::String, o::Integer)
    Regex(p, o, false)
end
function Regex(p::String)
    Regex(p, 0, false)
end

# copy hands back the very same Regex object rather than a duplicate.
function copy(r::Regex)
    r
end
# TODO: make sure things are escaped in a way PCRE
# likes so that all the Julia string quoting
# constructs are correctly handled.
# Regex string literal: r"pattern"flags. UTF8 is always enabled; each flag
# character adds one PCRE option (i, m, s, x). Any other character is an error.
macro r_str(pattern, flags...)
    options = PCRE.UTF8
    for fx in flags
        for f in fx
            if f == 'i'
                options |= PCRE.CASELESS
            elseif f == 'm'
                options |= PCRE.MULTILINE
            elseif f == 's'
                options |= PCRE.DOTALL
            elseif f == 'x'
                options |= PCRE.EXTENDED
            else
                error("unknown regex flag: $f")
            end
        end
    end
    Regex(pattern, options)
end
# Print a Regex. When its options are exactly UTF8 plus some of the i/m/s/x
# flags it is written in literal form (r"..." followed by flag letters);
# otherwise it is written as a Regex(pattern,options) constructor call.
function show(io, re::Regex)
    imsx = PCRE.CASELESS | PCRE.MULTILINE | PCRE.DOTALL | PCRE.EXTENDED
    if (re.options & ~imsx) != PCRE.UTF8
        # options that the literal syntax cannot express
        print(io, "Regex(")
        show(io, re.pattern)
        print(io, ',')
        show(io, re.options)
        print(io, ')')
    else
        print(io, 'r')
        print_quoted_literal(io, re.pattern)
        if (re.options & PCRE.CASELESS) != 0
            print(io, 'i')
        end
        if (re.options & PCRE.MULTILINE) != 0
            print(io, 'm')
        end
        if (re.options & PCRE.DOTALL) != 0
            print(io, 's')
        end
        if (re.options & PCRE.EXTENDED) != 0
            print(io, 'x')
        end
    end
end
# TODO: map offsets into non-ByteStrings back to original indices.
# or maybe it's better to just fail since that would be quite slow
# Result of a successful regex match, as constructed by `match`.
type RegexMatch
# text of the whole match
match::ByteString
# one entry per capture group; `match` stores `nothing` for a group whose
# reported start offset is negative
captures::Tuple
# start index of the whole match in the subject (`m[1]+1` in `match`;
# presumably a 0-based PCRE offset shifted to 1-based -- confirm in pcre.jl)
offset::Int
# start index of each capture group (`m[2i+1]+1` in `match`)
offsets::Vector{Int}
end
# Print a RegexMatch as RegexMatch("text", 1="cap1", 2="cap2", ...), listing
# every capture with its position number.
function show(io, m::RegexMatch)
    print(io, "RegexMatch(")
    show(io, m.match)
    # each capture is preceded by its separator, so nothing extra is printed
    # when there are no captures at all
    for idx = 1:length(m.captures)
        print(io, ", ")
        print(io, idx, "=")
        show(io, m.captures[idx])
    end
    print(io, ")")
end
# Run `r` against `s` from the beginning with execution options `o`, passing
# `false` as the final PCRE.exec argument (the `match` method passes `true`).
function matches(r::Regex, s::String, o::Integer)
    subject = cstring(s)
    PCRE.exec(r.regex, r.extra, subject, 0, o, false)
end

# Same, using the execution-time subset of the regex's own options.
function matches(r::Regex, s::String)
    exec_opts = r.options & PCRE.EXECUTE_MASK
    matches(r, s, exec_opts)
end

# `contains` is `matches` with the string given first.
function contains(s::String, r::Regex, opts::Integer)
    matches(r, s, opts)
end
function contains(s::String, r::Regex)
    matches(r, s)
end
# Search `str` for `re` starting at index `idx` with execution options `opts`.
# Returns a RegexMatch, or `nothing` when PCRE.exec reports no offsets.
function match(re::Regex, str::ByteString, idx::Integer, opts::Integer)
    ovec, ncap = PCRE.exec(re.regex, re.extra, str, idx-1, opts, true)
    if isempty(ovec)
        return nothing
    end
    # ovec[1], ovec[2] bound the whole match; the pair ovec[2k+1], ovec[2k+2]
    # bounds capture k, with a negative start marking a capture that is unset
    whole = str[ovec[1]+1:ovec[2]]
    groups = ntuple(ncap, k->(ovec[2k+1] < 0 ? nothing : str[ovec[2k+1]+1:ovec[2k+2]]))
    starts = map(k->ovec[2k+1]+1, [1:ncap])
    RegexMatch(whole, groups, ovec[1]+1, starts)
end
# Generic strings are converted with cstring before matching.
function match(r::Regex, s::String, i::Integer, o::Integer)
    match(r, cstring(s), i, o)
end

# Without explicit options, use the execution-time subset of the regex's own.
function match(r::Regex, s::String, i::Integer)
    exec_opts = r.options & PCRE.EXECUTE_MASK
    match(r, s, i, exec_opts)
end

# Without a start index, begin at the start of the string.
function match(r::Regex, s::String)
    match(r, s, start(s))
end
# Search `str` for `re` starting at index `idx`. Returns (0,0) when nothing is
# found, otherwise the pair (m[1]+1, m[2]+1) built from the first two offsets
# reported by PCRE.exec. An `idx` of length(str)+2 yields (0,0); anything
# larger is an error.
function search(str::ByteString, re::Regex, idx::Integer)
    limit = length(str) + 2
    if idx == limit
        return (0,0)
    elseif idx > limit
        return error("index out of range")
    end
    exec_opts = re.options & PCRE.EXECUTE_MASK
    ovec, ncap = PCRE.exec(re.regex, re.extra, str, idx-1, exec_opts, true)
    if isempty(ovec)
        (0,0)
    else
        (ovec[1]+1, ovec[2]+1)
    end
end

# Without a start index, search from the start of the string.
function search(s::ByteString, r::Regex)
    search(s, r, start(s))
end
# Iterable over the successive matches of `regex` in `string`; the iteration
# state is the current RegexMatch (see start/done/next).
type RegexMatchIterator
regex::Regex
string::ByteString
# when true, the next search begins one index after the previous match's
# start; otherwise it resumes past the previous match's text
overlap::Bool
end
# Iteration protocol for RegexMatchIterator. The state is the current
# RegexMatch, or `nothing` once there is no further match.
start(itr::RegexMatchIterator) = match(itr.regex, itr.string)
done(itr::RegexMatchIterator, m) = m == nothing

# Return the current match together with the next state.
function next(itr::RegexMatchIterator, m)
    # Always move forward by at least one index: a zero-length match would
    # otherwise be looked up again at the same position forever.
    # NOTE(review): a one-index step may land inside a multi-byte character
    # when the subject is not ASCII -- confirm how PCRE.exec reports that.
    advance = itr.overlap ? 1 : max(1, length(m.match))
    idx = m.offset + advance
    # A start index more than one past the end can never match (same bound
    # that `search` guards), so finish instead of passing it on to PCRE.
    upcoming = idx > length(itr.string) + 1 ? nothing : match(itr.regex, itr.string, idx)
    (m, upcoming)
end
# Build an iterator over the matches of `re` in `str`. `ovr` selects
# overlapping matches and defaults to false. The subject is converted with
# cstring, as the `match` and `matches` methods do, so that any String can be
# stored in the iterator's ByteString field.
each_match(re::Regex, str::String, ovr::Bool) = RegexMatchIterator(re, cstring(str), ovr)
each_match(re::Regex, str::String) = each_match(re, str, false)
# miscellaneous methods that depend on Regex being defined
# Keep only the entries of `d` whose key matches `r` (in place).
function filter!(r::Regex, d::Dict)
    key_matches = (k,v)->matches(r,k)
    filter!(key_matches, d)
end

# Non-mutating variant: filters a copy of `d`.
function filter(r::Regex, d::Dict)
    filter!(r, copy(d))
end