-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpcre_intf.cpp
275 lines (256 loc) · 10.7 KB
/
pcre_intf.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
//
// Copyright 2015-2021 by Kevin L. Goodwin [[email protected]]; All rights reserved
//
// This file is part of K.
//
// K is free software: you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the Free Software
// Foundation, either version 3 of the License, or (at your option) any later
// version.
//
// K is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General Public License along
// with K. If not, see <http://www.gnu.org/licenses/>.
//
#include "ed_main.h"
#include "ed_search.h"
#include <pcre2.h>
//----------- CompiledRegex
// For simple Regex string searches (vs. search-thru-file-until-next-match) ops, use Regex_Compile + CompiledRegex::Match
class CompiledRegex {
NO_ASGN_OPR(CompiledRegex);
NO_COPYCTOR(CompiledRegex);
private:
pcre2_code *d_pcreCode;
pcre2_match_data *d_pcreMatchData;
public:
// User code SHOULD NOT call this ctor, _SHOULD_ CREATE CompiledRegex via Regex_Compile!
CompiledRegex( pcre2_code *pcreCode, pcre2_match_data *pcreMatchData ) // called ONLY by Regex_Compile (when it is successful)
: d_pcreCode(pcreCode)
, d_pcreMatchData( pcreMatchData )
{
}
~CompiledRegex() {
pcre2_code_free( d_pcreCode );
pcre2_match_data_free( d_pcreMatchData );
}
RegexMatchCaptures::size_type Match( RegexMatchCaptures &captures, stref haystack, COL haystack_offset, int pcre_exec_options );
};
// Initialization hook for the PCRE API; Regex_Compile calls it before compiling.
// At present it performs no work: the guarded branch below is empty.
void PCRE_API_INIT() {
// NOTE(review): BoolOneShot presumably evaluates true only the first time it is
// tested (one-shot latch) -- confirm against its definition.
STATIC_VAR BoolOneShot first;
if( first ) {
// intentionally empty: nothing needs one-time setup right now
}
}
// Returns a "PCRE <version>" description of the PCRE2 library in use.  The text
// is built on the first call and cached in a function-local static buffer, so
// the returned stref refers to that static storage.
stref RegexVersion() {
   STATIC_VAR char s_cachedVer[31] = { '\0', };
   if( s_cachedVer[0] != '\0' ) { // already formatted by an earlier call
      return s_cachedVer;
   }
   char libVer[25];
   pcre2_config( PCRE2_CONFIG_VERSION, libVer );
   1 && DBG( "PCRE version '%s'", libVer );
   safeSprintf( BSOB(s_cachedVer), "PCRE %s", libVer );
   return s_cachedVer;
}
//------------------------------------------------------------------------------
// Debug aid: emits one DBG line per capture, formatted as tag[index] 'text'.
// A capture that is not valid() is shown as the text "-1".  Always returns 1.
int DbgDumpCaptures( RegexMatchCaptures &captures, PCChar tag ) {
   int capIdx = 0;
   for( const auto &cap : captures ) {
      const auto shown = cap.valid() ? cap.value() : stref("-1");
      DBG( "%s[%d] '%" PR_BSR "'", tag, capIdx, BSR(shown) );
      ++capIdx;
   }
   return 1;
}
// Runs the compiled pattern against haystack, starting at haystack_offset, by
// calling pcre2_match with pcre_exec_options.
//
// captures is cleared first.  On a successful match it receives one entry per
// ovector pair of the match data: entry 0 describes the text matched by the
// whole pattern, entry N the Nth capturing subpattern.  A pair whose offsets are
// both PCRE2_UNSET (subpattern did not participate) yields a default-constructed
// entry; every other pair yields (start offset, stref onto haystack).
//
// Returns captures.size(), i.e. 0 when nothing matched or pcre2_match failed.
// Any failure other than PCRE2_ERROR_NOMATCH is reported via ErrorDialogBeepf.
RegexMatchCaptures::size_type CompiledRegex::Match( RegexMatchCaptures &captures, stref haystack, COL haystack_offset, int pcre_exec_options ) {
   0 && DBG( "CompiledRegex::Match called!" );
   captures.clear(); // done up front so that every exit path hands back consistent content
   const int matchRc( pcre2_match(
        d_pcreCode
      , reinterpret_cast<PCRE2_SPTR>( haystack.data() )  // subject
      , haystack.length()                                // subject length
      , haystack_offset                                  // offset at which matching starts
      , pcre_exec_options                                // match options
      , d_pcreMatchData                                  // receives the offset pairs
      , nullptr                                          // no match context: default behaviors
      )
   );
   0 && DBG( "CompiledRegex::Match returned %d", matchRc );
   if( matchRc <= 0 ) {
      if( matchRc != PCRE2_ERROR_NOMATCH ) { // "no match" is the sole expected failure: stay silent for it
         uint8_t errMsg[150];
         if( PCRE2_ERROR_BADDATA == pcre2_get_error_message( matchRc, BSOB(errMsg) ) ) {
            ErrorDialogBeepf( "pcre2_match returned unknown error %d", matchRc );
         }
         else {
            ErrorDialogBeepf( "pcre2_match returned '%s'?", errMsg );
         }
      }
      return captures.size();
   }
   0 && DBG( "CompiledRegex::Match count=%d", matchRc );
   // The ovector is a sequence of (first, past-last) offset pairs: pair 0 spans
   // the portion of the subject matched by the entire pattern (Perl's $0), pair 1
   // the first capturing subpattern (Perl's $1), and so on.  Every pair the
   // match data holds is converted, not just those counted by the return value.
   const auto pairCount = pcre2_get_ovector_count( d_pcreMatchData );
   captures.reserve( pairCount );
   const auto ovector = pcre2_get_ovector_pointer( d_pcreMatchData );
   for( std::remove_const_t<decltype(pairCount)> ix{0}; ix < pairCount; ++ix ) {
      const auto oFirst    = ovector[ 2*ix     ];
      const auto oPastLast = ovector[ 2*ix + 1 ];
      // Subpattern n+1 can match even though subpattern n was never used: e.g.
      // matching "abc" against (a|(z))(bc) sets subpatterns 1 and 3 but not 2.
      // Both offsets of such an unused subpattern are PCRE2_UNSET.
      if( oFirst != PCRE2_UNSET || oPastLast != PCRE2_UNSET ) {
         captures.emplace_back( oFirst, stref( haystack.data() + oFirst, oPastLast - oFirst ) );
      }
      else {
         captures.emplace_back();
      }
   }
   return captures.size();
}
// Free-function form of CompiledRegex::Match: forwards all arguments to pcr and
// returns its result unchanged.  pcr is dereferenced without a null check.
RegexMatchCaptures::size_type Regex_Match( CompiledRegex *pcr, RegexMatchCaptures &captures, stref haystack, COL haystack_offset, int pcre_exec_options ) {
   CompiledRegex &regex( *pcr );
   return regex.Match( captures, haystack, haystack_offset, pcre_exec_options );
}
// Destroys pcr and returns nullptr, so that a caller can delete and reset its
// pointer in one statement:  p = Regex_Delete0( p );
CompiledRegex *Regex_Delete0( CompiledRegex *pcr ) {
   CompiledRegex *const noRegex = nullptr;
   delete pcr;
   return noRegex;
}
// Compiles pszSearchStr as a PCRE2 pattern and wraps the result, along with a
// match-data block sized from the pattern, in a heap-allocated CompiledRegex.
//
//   fCase  true: default (no option bits); false: compile with PCRE2_CASELESS
//
// Returns the new CompiledRegex, or nullptr when compilation fails (the error is
// shown via Display_hilite_regex_err, or via Msg if no error text is available)
// or when the match-data block cannot be created (reported via Msg).
CompiledRegex *Regex_Compile( stref pszSearchStr, bool fCase ) {
   0 && DBG( "Regex_Compile! %" PR_BSR, BSR(pszSearchStr) );
   PCRE_API_INIT();
   int errCode;
   PCRE2_SIZE errOffset;
   const int compileOpts( fCase ? 0 : PCRE2_CASELESS );
   const auto pattern = reinterpret_cast<PCRE2_SPTR>( pszSearchStr.data() );
   auto compiled( pcre2_compile( pattern, pszSearchStr.length(), compileOpts, &errCode, &errOffset, nullptr ) );
   if( compiled ) {
      auto matchData = pcre2_match_data_create_from_pattern( compiled, nullptr );
      if( matchData ) {
         return new CompiledRegex( compiled, matchData );
      }
      pcre2_code_free( compiled ); // the compiled pattern has no owner yet: release it here
      Msg( "pcre2_match_data_create_from_pattern returned NULL" );
      return nullptr;
   }
   // compilation failed: translate errCode into text and report it
   uint8_t errMsg[150];
   if( PCRE2_ERROR_BADDATA == pcre2_get_error_message( errCode, BSOB(errMsg) ) ) {
      Msg( "pcre2_compile returned unknown error %d", errCode );
      return nullptr;
   }
   0 && DBG( "Regex_Compile! Display_hilite_regex_err" );
   Display_hilite_regex_err( reinterpret_cast<PCChar>(errMsg), pszSearchStr, errOffset );
   return nullptr;
}
//
// GenericListEl was developed for encoding regex replacement strings which can
// include backreferences; an IS_INT variant signifies that "the content of the
// backreference numbered N" is to be appended (IS_STRING values are appended
// directly).
//
// Since the implementation is reasonably memory efficient (one alloc per
// element, even for strings, with additional overhead being only the link
// pointers plus sizeof(enum) (the minimal extra cost of including the IS_INT
// variant)), this data struct is useful for general string concatenation as
// well as its originally-intended purpose.
//
// One node of a GenericList: a tagged value that is either an int or a
// NUL-terminated string.  For strings, str is only the START of the text; the
// node is allocated with extra room so the characters continue past the end of
// the declared array (see GenericList::Cat( PCChar, size_t )).
struct GenericListEl {
   DLinkEntry<GenericListEl> dlink;       // list linkage
   enum { IS_INT, IS_STRING } d_typeof;   // selects the active member of value
   union {
      int  num;
      char str[ sizeof(int) ];
   } value;

   // Stores the string (or nullptr if this node is not a string) in *ppStr;
   // returns true iff this node holds a string.
   bool IsStr( const char **ppStr ) const {
      const bool holdsStr = ( IS_STRING == d_typeof );
      *ppStr = holdsStr ? value.str : nullptr;
      return holdsStr;
   }

   // Stores the number (or 0 if this node is not an int) in *pInt;
   // returns true iff this node holds an int.
   bool IsInt( int *pInt ) const {
      const bool holdsInt = ( IS_INT == d_typeof );
      *pInt = holdsInt ? value.num : 0;
      return holdsInt;
   }
};
// A list of GenericListEl nodes, appended to via the Cat overloads.  The list
// owns its nodes: the destructor unlinks and frees every element.
struct GenericList {
// head of the node list; elements are appended at the tail
DLinkHead<GenericListEl> d_head;
GenericList() {}
// virtual because ReplaceWithCaptures derives from this struct
virtual ~GenericList();
// appends an IS_INT element holding num
void Cat( int num );
// appends an IS_STRING element holding a NUL-terminated copy of the first len
// chars of src; len == 0 means "measure src with Strlen"
void Cat( PCChar src, size_t len );
};
// Tears the list down front to back: each node is unlinked from the head and
// then freed, until front() yields no element.
GenericList::~GenericList() {
   for(;;) {
      auto pEl = d_head.front();
      if( !pEl ) {
         break;
      }
      DLINK_REMOVE_FIRST( d_head, pEl, dlink );
      Free0( pEl );
   }
}
void GenericList::Cat( int num ) {
GenericListEl *rv;
AllocArrayNZ( rv, 1 );
rv->dlink.clear();
rv->d_typeof = GenericListEl::IS_INT;
rv->value.num = num;
DLINK_INSERT_LAST(d_head, rv, dlink);
}
// Appends a new IS_STRING element holding a NUL-terminated copy of the first
// len chars of src to the tail of the list.
//
//   src  text to copy (need not be NUL-terminated when len != 0)
//   len  number of chars to copy; 0 means "measure src with Strlen"
//
// The string is stored inline: a single allocation holds the control struct
// followed by as many bytes as the text (plus NUL) requires beyond the
// value member's own footprint.
void GenericList::Cat( PCChar src, size_t len ) {
   if( len == 0 ) {
      len = Strlen( src );
   }
   // Bytes of string storage the allocation must provide.  Never go below the
   // size of the value member itself: otherwise a short string would yield an
   // allocation SMALLER than sizeof(GenericListEl), and the node would be
   // accessed through a pointer to storage too small for its type.
   const size_t inlineBytes = sizeof( GenericListEl::value );
   const size_t textBytes   = len + 1; // text + terminating NUL
   const size_t tailBytes   = textBytes > inlineBytes ? textBytes : inlineBytes;
   GenericListEl *rv = static_cast<GenericListEl *>( // cannot use auto due to 'sizeof( *rv )'
      AllocNZ_(
           sizeof( *rv ) // needed control struct
         - inlineBytes   // space within control struct that is used to store string value
         + tailBytes     // space to store string value
         )
      );
   rv->dlink.clear();
   rv->d_typeof = GenericListEl::IS_STRING;
   memcpy( rv->value.str, src, len );
   rv->value.str[len] = '\0';
   DLINK_INSERT_LAST(d_head, rv, dlink);
}
// A GenericList built by parsing a regex replacement string: literal text runs
// become IS_STRING elements and '%' followed by a decimal digit becomes an
// IS_INT element holding that digit's value; '%' followed by any other char
// contributes that char as literal text.
struct ReplaceWithCaptures : public GenericList {
// parses src (a NUL-terminated replacement string) into list elements
ReplaceWithCaptures( PCChar src );
};
// Parses the NUL-terminated replacement string src into list elements:
//   - each run of chars up to (not including) the next '%' is appended as a
//     string element;
//   - "%<digit>" is appended as an int element holding the digit's value;
//   - "%<other>" appends <other> as a one-char string element (so "%%" yields
//     a literal '%');
//   - a lone '%' at the very end of src is dropped.
ReplaceWithCaptures::ReplaceWithCaptures( PCChar src ) {
   auto p( src );
   const auto end( src + Strlen(p) );
   while( p < end ) {
      // scan the literal run [p, q) that precedes the next '%' (or the end)
      const char *q;
      for( q = p; q < end && *q != '%'; ++q ) {
      }
      if( q != p ) { Cat( p, q - p ); }
      if( q >= end ) { break; } // no '%' remains: done
      if( ++q < end ) {         // skip %
         // isdigit requires a value representable as unsigned char (or EOF):
         // passing a plain char that is negative (byte >= 0x80 where char is
         // signed) is undefined behavior, hence the cast.
         if( isdigit( static_cast<unsigned char>(*q) ) ) { Cat( *q - '0' ); }
         else                                            { Cat( q, 1 ); }
      }
      p = q + 1; // resume after the char that followed '%'
   }
}
// Skeleton exercising ReplaceWithCaptures: converts szRawReplace into its
// element list and walks it, classifying each element as string or int.  The
// actions that would assemble a result string are still commented out, so the
// walk currently has no output.
STATIC_FXN void test_rpc( PCChar szRawReplace ) {
   GenericList ResultStr;
   const ReplaceWithCaptures rplc( szRawReplace ); // convert replace string into capture-handling representation
   DLINKC_FIRST_TO_LASTA( rplc.d_head, dlink, pEl ) {
      PCChar str;
      int num;
      if( pEl->IsStr( &str ) ) {
         // literal text: would be appended as-is
         // buffer_addlstring (&ResultStr, str, num);
      }
      else if( pEl->IsInt( &num ) ) {
         // backreference: would append the text of capture 'num'
         // if( ALG_SUBVALID (ud,num) ) // backref exists?
         //    buffer_addlstring (&ResultStr, argE.text + ALG_BASE(pos) + ALG_SUBBEG(ud,num), ALG_SUBLEN(ud,num));
         //
      }
   }
}