Skip to content

Commit

Permalink
SyntaxTokenizer improvements. Now it's possible to set captures to su…
Browse files Browse the repository at this point in the history
…b-syntaxes and ranges.

Fix in RegEx that prevented ^ from working properly in the tokenizer.
  • Loading branch information
SpartanJ committed Mar 9, 2025
1 parent dfb0820 commit b84080f
Show file tree
Hide file tree
Showing 5 changed files with 133 additions and 42 deletions.
6 changes: 6 additions & 0 deletions include/eepp/system/patternmatcher.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,12 @@ class EE_API PatternMatcher {
bool find( const char* stringSearch, int& startMatch, int& endMatch, int stringStartOffset = 0,
int stringLength = 0, int returnMatchIndex = 0 ) const;

/** Searches @p s for the pattern, using a caller-provided buffer for the match ranges.
 * Convenience overload: forwards s.c_str() and s.size() to the const char* overload that
 * takes a matches buffer.
 * @param s Subject string to search.
 * @param startMatch Receives the start of the requested match; set to -1 when nothing matches.
 * @param endMatch Receives the end of the requested match; set to -1 when nothing matches.
 * @param offset Position in @p s where the search starts.
 * @param returnedMatchIndex Index of the match/capture whose range is written to
 * startMatch/endMatch.
 * @param matchesBuffer Caller-owned array that receives the ranges found by the matcher.
 * NOTE(review): no capacity is passed along with the buffer, so it presumably must be large
 * enough for every capture of the pattern; confirm against the matcher implementations.
 * @return true if the pattern matched. */
bool find( const std::string& s, int& startMatch, int& endMatch, int offset,
int returnedMatchIndex, PatternMatcher::Range* matchesBuffer ) const;

/** Searches @p stringSearch for the pattern, using a caller-provided buffer for the match
 * ranges instead of a buffer local to the call.
 * @param stringSearch Subject text to search.
 * @param startMatch Receives the start of the requested match; set to -1 when nothing matches.
 * @param endMatch Receives the end of the requested match; set to -1 when nothing matches.
 * @param stringStartOffset Position in the subject where the search starts.
 * @param stringLength Length of the subject text.
 * @param returnMatchIndex Index of the match/capture whose range is written to
 * startMatch/endMatch.
 * @param matchesBuffer Caller-owned array that receives the ranges found by the matcher.
 * NOTE(review): no capacity is passed along with the buffer, so it presumably must be large
 * enough for every capture of the pattern; confirm against the matcher implementations.
 * @return true if the pattern matched. */
bool find( const char* stringSearch, int& startMatch, int& endMatch, int stringStartOffset,
int stringLength, int returnMatchIndex, PatternMatcher::Range* matchesBuffer ) const;

std::string gsub( const char* text, const char* replace );

std::string gsub( const std::string& text, const std::string& replace );
Expand Down
19 changes: 19 additions & 0 deletions src/eepp/system/patternmatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,19 @@ namespace EE { namespace System {

#define MAX_DEFAULT_MATCHES 12

/** Looks for the pattern in @p stringSearch, storing the match ranges in a buffer owned by the
 * caller.
 *
 * The search itself is delegated to matches(); when it succeeds, range() is asked to fill
 * @p startMatch / @p endMatch for @p returnMatchIndex from @p matchesBuffer.
 *
 * @param stringSearch Subject text.
 * @param startMatch Output: start of the requested match, or -1 when nothing matched.
 * @param endMatch Output: end of the requested match, or -1 when nothing matched.
 * @param stringStartOffset Position in the subject where the search starts.
 * @param stringLength Length of the subject text.
 * @param returnMatchIndex Index of the match/capture to report.
 * @param matchesBuffer Caller-owned array handed to matches() to receive the ranges.
 * @return true when matches() reports a match, false otherwise. */
bool PatternMatcher::find( const char* stringSearch, int& startMatch, int& endMatch,
						   int stringStartOffset, int stringLength, int returnMatchIndex,
						   PatternMatcher::Range* matchesBuffer ) const {
	const bool found = matches( stringSearch, stringStartOffset, matchesBuffer, stringLength );

	if ( !found ) {
		// Nothing matched: report the sentinel range.
		startMatch = -1;
		endMatch = -1;
		return false;
	}

	range( returnMatchIndex, startMatch, endMatch, matchesBuffer );
	return true;
}

bool PatternMatcher::find( const char* stringSearch, int& startMatch, int& endMatch,
int stringStartOffset, int stringLength, int returnMatchIndex ) const {
PatternMatcher::Range matchesBuffer[MAX_DEFAULT_MATCHES];
Expand All @@ -22,6 +35,12 @@ bool PatternMatcher::find( const char* stringSearch, int& startMatch, int& endMa
}
}

/** std::string convenience overload of find() that uses a caller-provided matches buffer.
 *
 * Passes the string's character data and size on to the const char* overload; every other
 * argument is forwarded untouched.
 *
 * @param s Subject string.
 * @param startMatch Output: start of the requested match.
 * @param endMatch Output: end of the requested match.
 * @param offset Position in @p s where the search starts.
 * @param returnedMatchIndex Index of the match/capture to report.
 * @param matchesBuffer Caller-owned array that receives the match ranges.
 * @return The result of the forwarded find() call. */
bool PatternMatcher::find( const std::string& s, int& startMatch, int& endMatch, int offset,
						   int returnedMatchIndex, PatternMatcher::Range* matchesBuffer ) const {
	const char* subject = s.c_str();
	const auto subjectLength = s.size();
	return find( subject, startMatch, endMatch, offset, subjectLength, returnedMatchIndex,
				 matchesBuffer );
}

bool PatternMatcher::find( const std::string& s, int& startMatch, int& endMatch, int offset,
int returnedMatchIndex ) const {
return find( s.c_str(), startMatch, endMatch, offset, s.size(), returnedMatchIndex );
Expand Down
18 changes: 9 additions & 9 deletions src/eepp/system/regex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,13 +88,13 @@ bool RegEx::matches( const char* stringSearch, int stringStartOffset,

PCRE2_SPTR subject = reinterpret_cast<PCRE2_SPTR>( stringSearch );

int rc = pcre2_match( compiledPattern, // the compiled pattern
subject, // the subject string
stringLength, // the length of the subject
stringStartOffset, // start at offset in the subject
0, // default options
match_data, // match data
NULL // match context
int rc = pcre2_match( compiledPattern, // the compiled pattern
subject + stringStartOffset, // the subject string
stringLength - stringStartOffset, // the length of the subject
0, // start at offset in the subject
0, // default options
match_data, // match data
NULL // match context
);

if ( rc < 0 ) {
Expand All @@ -111,8 +111,8 @@ bool RegEx::matches( const char* stringSearch, int stringStartOffset,
if ( matchList != nullptr ) {
PCRE2_SIZE* ovector = pcre2_get_ovector_pointer( match_data );
for ( size_t i = 0; i < static_cast<size_t>( rc ); ++i ) {
matchList[i].start = static_cast<int>( ovector[2 * i] );
matchList[i].end = static_cast<int>( ovector[2 * i + 1] );
matchList[i].start = stringStartOffset + static_cast<int>( ovector[2 * i] );
matchList[i].end = stringStartOffset + static_cast<int>( ovector[2 * i + 1] );
if ( matchList[i].start >= matchList[i].end ) {
matchList[i].start = matchList[i].end = -1;
mMatchNum--;
Expand Down
8 changes: 4 additions & 4 deletions src/eepp/ui/doc/languages/cpp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ void addCPP() {
{ "%.cpp$", "%.cc$", "%.cxx$", "%.c++$", "%.hh$", "%.inl$", "%.hxx$", "%.hpp$", "%.h++$",
"%.tcc$" },
{
{ { "R%\"xml%(", "%)xml%\"" }, "function", "XML" },
{ { "R%\"css%(", "%)css%\"" }, "function", "CSS" },
{ { "R%\"html%(", "%)html%\"" }, "function", "HTML" },
{ { "R%\"json%(", "%)json%\"" }, "function", "JSON" },
{ { "R%\"(xml)%(", "%)(xml)%\"" }, { "string", "keyword2", "keyword2" }, "XML" },
{ { "R%\"(css)%(", "%)(css)%\"" }, { "string", "keyword2", "keyword2" }, "CSS" },
{ { "R%\"(html)%(", "%)(html)%\"" }, { "string", "keyword2", "keyword2" }, "HTML" },
{ { "R%\"(json)%(", "%)(json)%\"" }, { "string", "keyword2", "keyword2" }, "JSON" },
{ { "R\"[%a-\"]+%(", "%)[%a-\"]+%\"" }, "string" },
{ { "R\"%(", "%)\"" }, "string" },
{ { "//.-\n" }, "comment" },
Expand Down
124 changes: 95 additions & 29 deletions src/eepp/ui/doc/syntaxtokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,26 +98,36 @@ static bool isScaped( const std::string& text, const size_t& startIndex,
return count % 2 == 1;
}

static std::pair<int, int> findNonEscaped( const std::string& text, const std::string& pattern,
int offset, const std::string& escapeStr,
bool isRegEx ) {
/** Result of a findNonEscaped() search. A default-constructed value means "no match". */
struct NonEscapedMatch {
	/** {start, end} of the accepted match; {-1, -1} when nothing was found. */
	std::pair<int, int> range = { -1, -1 };
	/** Ranges reported by the matcher for the accepted match. */
	PatternMatcher::Range matches[6];
	/** Number of matches reported by the matcher for the accepted match. */
	int numMatches = 0;
};

// Finds the first occurrence of `pattern` in `text`, starting at `offset`, that is not escaped.
// - text: subject string to search.
// - pattern: pattern source; compiled as a RegEx when `isRegEx` is true, otherwise as a
//   LuaPattern. Must not be empty.
// - offset: position in `text` where the search starts.
// - escapeStr: escape sequence; a match for which isScaped() returns true is skipped. An empty
//   string disables the escape check.
// Returns a NonEscapedMatch holding the accepted {start, end} range, the matcher's match count
// and a copy of the ranges buffer; a default-constructed NonEscapedMatch when the pattern is
// empty or no unescaped match exists.
// NOTE(review): this span comes from a diff rendered without +/- markers, so superseded
// statements (the std::make_pair returns and the first `while`) appear next to their
// replacements.
static NonEscapedMatch findNonEscaped( const std::string& text, const std::string& pattern,
int offset, const std::string& escapeStr, bool isRegEx ) {
eeASSERT( !pattern.empty() );
if ( pattern.empty() )
return std::make_pair( -1, -1 );
return {};
// Build the concrete matcher in a variant and access it through the PatternMatcher base.
std::variant<RegEx, LuaPattern> wordsVar =
isRegEx ? std::variant<RegEx, LuaPattern>( RegEx( pattern ) )
: std::variant<RegEx, LuaPattern>( LuaPattern( pattern ) );
PatternMatcher& words =
std::visit( []( auto& patternType ) -> PatternMatcher& { return patternType; }, wordsVar );
int start, end;
while ( words.find( text, start, end, offset ) ) {
// Ranges buffer filled by find(); its size has to stay equal to NonEscapedMatch::matches
// because the whole array is memcpy'd into the result below.
PatternMatcher::Range matches[6];
while ( words.find( text, start, end, offset, 0, matches ) ) {
if ( !escapeStr.empty() && isScaped( text, start, escapeStr ) ) {
// Escaped occurrence: keep searching right after it.
offset = end;
} else {
return std::make_pair( start, end );
// Unescaped occurrence: hand back its range together with the ranges buffer.
NonEscapedMatch res;
res.range = { start, end };
res.numMatches = words.getNumMatches();
std::memcpy( res.matches, matches, sizeof( matches ) );
return res;
}
}
return std::make_pair( -1, -1 );
// No unescaped match found.
return {};
}

SyntaxStateRestored SyntaxTokenizer::retrieveSyntaxState( const SyntaxDefinition& syntax,
Expand Down Expand Up @@ -184,6 +194,55 @@ static inline void popSubsyntax( SyntaxStateRestored& curState, SyntaxState& ret
curState = SyntaxTokenizer::retrieveSyntaxState( syntax, retState );
};

/** Emits the tokens for the delimiter of a sub-syntax (the text matched by its open/close
 * pattern), splitting it by capture when the pattern has captures.
 *
 * Without captures (numMatches <= 1) a single token typed subsyntaxInfo->types[0] is pushed,
 * covering the text from @p i up to the end of the match.
 *
 * With captures, the text from @p i to the start of the whole match and every stretch of the
 * match that is not covered by a capture are pushed as types[0]; capture N is pushed as
 * types[N] when that type exists and as types[0] otherwise.
 *
 * Captures whose range is invalid (negative start, or end before start) are skipped:
 * RegEx::matches() rewrites empty/unset captures to {-1, -1}, and handing such a position to
 * std::string_view::substr() throws std::out_of_range. The trailing part of the match is
 * emitted after the loop so it is not lost when the last capture is one of the skipped ones.
 *
 * @param i Current position in @p textv; text before the match start is emitted from here.
 * @param textv Text being tokenized.
 * @param subsyntaxInfo Pattern that owns the delimiter; supplies the token types.
 * @param rangeSubsyntax Match to tokenize (matches[0] is the whole match, the rest are the
 * captures).
 * @param tokens Destination token list. */
template <typename T>
static inline void
pushTokensToOpenCloseSubsyntax( int i, std::string_view textv, const SyntaxPattern* subsyntaxInfo,
								const NonEscapedMatch& rangeSubsyntax, std::vector<T>& tokens ) {
	if ( rangeSubsyntax.numMatches <= 1 ) {
		pushToken( tokens, subsyntaxInfo->types[0],
				   textv.substr( i, rangeSubsyntax.range.second - i ) );
		return;
	}

	const int patternMatchStart = rangeSubsyntax.matches[0].start;
	const int patternMatchEnd = rangeSubsyntax.matches[0].end;
	const auto& patternType = subsyntaxInfo->types[0];
	const int numTypes = static_cast<int>( subsyntaxInfo->types.size() );

	// Never read past the fixed-size ranges buffer, whatever count the matcher reported.
	const int maxMatches = static_cast<int>( sizeof( rangeSubsyntax.matches ) /
											 sizeof( rangeSubsyntax.matches[0] ) );
	const int numMatches =
		rangeSubsyntax.numMatches < maxMatches ? rangeSubsyntax.numMatches : maxMatches;

	if ( i < patternMatchStart )
		pushToken( tokens, patternType, textv.substr( i, patternMatchStart - i ) );

	// End of the last emitted capture (or the match start while none has been emitted yet).
	int cursor = patternMatchStart;

	for ( int sidx = 1; sidx < numMatches; sidx++ ) {
		const int start = rangeSubsyntax.matches[sidx].start;
		const int end = rangeSubsyntax.matches[sidx].end;

		// Unset capture: nothing to emit, and substr() would throw on a negative position.
		if ( start < 0 || end < start )
			continue;

		// Text between the previous capture (or the match start) and this capture.
		if ( start > cursor )
			pushToken( tokens, patternType, textv.substr( cursor, start - cursor ) );

		pushToken( tokens, sidx < numTypes ? subsyntaxInfo->types[sidx] : patternType,
				   textv.substr( start, end - start ) );

		cursor = end;
	}

	// Remainder of the match after the last emitted capture.
	if ( cursor < patternMatchEnd )
		pushToken( tokens, patternType, textv.substr( cursor, patternMatchEnd - cursor ) );
}

template <typename T>
static inline std::pair<std::vector<T>, SyntaxState>
_tokenize( const SyntaxDefinition& syntax, const std::string& text, const SyntaxState& state,
Expand Down Expand Up @@ -211,28 +270,29 @@ _tokenize( const SyntaxDefinition& syntax, const std::string& text, const Syntax
if ( curState.currentPatternIdx != SYNTAX_TOKENIZER_STATE_NONE ) {
const SyntaxPattern& pattern =
curState.currentSyntax->getPatterns()[curState.currentPatternIdx - 1];
std::pair<int, int> range = findNonEscaped(
text, pattern.patterns[1], i,
pattern.patterns.size() >= 3 ? pattern.patterns[2] : "", pattern.isRegEx );
auto range = findNonEscaped( text, pattern.patterns[1], i,
pattern.patterns.size() >= 3 ? pattern.patterns[2] : "",
pattern.isRegEx )
.range;

bool skip = false;

if ( curState.subsyntaxInfo != nullptr ) {
std::pair<int, int> rangeSubsyntax =
findNonEscaped( text, curState.subsyntaxInfo->patterns[1], i,
curState.subsyntaxInfo->patterns.size() >= 3
? curState.subsyntaxInfo->patterns[2]
: "",
pattern.isRegEx );
if ( curState.subsyntaxInfo != nullptr &&
curState.subsyntaxInfo->patterns.size() > 1 ) {
auto rangeSubsyntax = findNonEscaped( text, curState.subsyntaxInfo->patterns[1], i,
curState.subsyntaxInfo->patterns.size() >= 3
? curState.subsyntaxInfo->patterns[2]
: "",
pattern.isRegEx );

if ( rangeSubsyntax.first != -1 &&
( range.first == -1 || rangeSubsyntax.first < range.first ) ) {
if ( rangeSubsyntax.range.first != -1 &&
( range.first == -1 || rangeSubsyntax.range.first < range.first ) ) {
if ( !skipSubSyntaxSeparator ) {
pushToken( tokens, curState.subsyntaxInfo->types[0],
textv.substr( i, rangeSubsyntax.second - i ) );
pushTokensToOpenCloseSubsyntax( i, textv, curState.subsyntaxInfo,
rangeSubsyntax, tokens );
}
popSubsyntax( curState, retState, syntax );
i = rangeSubsyntax.second;
i = rangeSubsyntax.range.second;
skip = true;
}
}
Expand All @@ -255,20 +315,20 @@ _tokenize( const SyntaxDefinition& syntax, const std::string& text, const Syntax
}
}

if ( curState.subsyntaxInfo != nullptr ) {
std::pair<int, int> rangeSubsyntax = findNonEscaped(
if ( curState.subsyntaxInfo != nullptr && curState.subsyntaxInfo->patterns.size() > 1 ) {
auto rangeSubsyntax = findNonEscaped(
text, "^" + curState.subsyntaxInfo->patterns[1], i,
curState.subsyntaxInfo->patterns.size() >= 3 ? curState.subsyntaxInfo->patterns[2]
: "",
curState.subsyntaxInfo->isRegEx );

if ( rangeSubsyntax.first != -1 ) {
if ( rangeSubsyntax.range.first != -1 ) {
if ( !skipSubSyntaxSeparator ) {
pushToken( tokens, curState.subsyntaxInfo->types[0],
textv.substr( i, rangeSubsyntax.second - i ) );
pushTokensToOpenCloseSubsyntax( i, textv, curState.subsyntaxInfo,
rangeSubsyntax, tokens );
}
popSubsyntax( curState, retState, syntax );
i = rangeSubsyntax.second;
i = rangeSubsyntax.range.second;
}
}

Expand Down Expand Up @@ -334,7 +394,8 @@ _tokenize( const SyntaxDefinition& syntax, const std::string& text, const Syntax
patternText );
}

if ( pattern.hasSyntax() ) {
if ( pattern.hasSyntax() && curMatch == numMatches - 1 &&
end == patternMatchEnd ) {
pushSubsyntax( curState, retState, pattern, patternIndex + 1,
patternStr );
} else if ( pattern.patterns.size() > 1 ) {
Expand All @@ -347,6 +408,11 @@ _tokenize( const SyntaxDefinition& syntax, const std::string& text, const Syntax
pushToken( tokens, patternType,
textv.substr( end, patternMatchEnd - end ) );
i = patternMatchEnd;

if ( pattern.hasSyntax() && curMatch == numMatches - 1 ) {
pushSubsyntax( curState, retState, pattern, patternIndex + 1,
patternStr );
}
}

matched = true;
Expand Down

0 comments on commit b84080f

Please sign in to comment.