Skip to content

Commit

Permalink
Refactor regexp and regexp_like function (#6082)
Browse files Browse the repository at this point in the history
close #5984, ref #6115
  • Loading branch information
xzhangxian1008 authored Nov 7, 2022
1 parent 070c466 commit 60f5dce
Show file tree
Hide file tree
Showing 13 changed files with 2,099 additions and 716 deletions.
2 changes: 1 addition & 1 deletion dbms/src/Flash/Coprocessor/DAGUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,7 @@ const std::unordered_map<tipb::ScalarFuncSig, String> scalar_func_map({
{tipb::ScalarFuncSig::LikeSig, "like3Args"},
{tipb::ScalarFuncSig::RegexpSig, "regexp"},
{tipb::ScalarFuncSig::RegexpUTF8Sig, "regexp"},
{tipb::ScalarFuncSig::RegexpLikeSig, "regexp"},
{tipb::ScalarFuncSig::RegexpLikeSig, "regexp_like"},
// {tipb::ScalarFuncSig::RegexpInStrSig, "regexp_instr"},
// {tipb::ScalarFuncSig::RegexpReplaceSig, "regexp_replace"},
// {tipb::ScalarFuncSig::RegexpSubstrSig, "regexp_substr"},
Expand Down
300 changes: 300 additions & 0 deletions dbms/src/Functions/FunctionsRegexp.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,300 @@
// Copyright 2022 PingCAP, Ltd.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <Columns/ColumnNullable.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsRegexp.h>
#include <Functions/Regexps.h>
#include <fmt/core.h>

namespace DB
{
/** Replace matches of regexp 'needle' with string 'replacement' (all matches, or a single one when 'replace_one' is set).
* 'needle' and 'replacement' are constants.
* 'replacement' may contain substitutions, for example: '\2-\3-\1'
*/
template <bool replace_one = false>
struct ReplaceRegexpImpl
{
static constexpr bool support_non_const_needle = false;
static constexpr bool support_non_const_replacement = false;
/// need customized escape char when do the string search
static const bool need_customized_escape_char = false;
/// support match type when do the string search, used in regexp
static const bool support_match_type = true;

/// Sequence of instructions, describing how to get resulting string.
/// Each element is either:
/// - substitution (in that case first element of pair is their number and second element is empty)
/// - string that need to be inserted (in that case, first element of pair is -1 and second element is that string)
using Instructions = std::vector<std::pair<int, std::string>>;

static const size_t max_captures = 10;

static Instructions createInstructions(const std::string & s, int num_captures)
{
Instructions instructions;

String now;
for (size_t i = 0; i < s.size(); ++i)
{
if (s[i] == '\\' && i + 1 < s.size())
{
if (isNumericASCII(s[i + 1])) /// Substitution
{
if (!now.empty())
{
instructions.emplace_back(-1, now);
now = "";
}
instructions.emplace_back(s[i + 1] - '0', String());
}
else
now += s[i + 1]; /// Escaping
++i;
}
else
now += s[i]; /// Plain character
}

if (!now.empty())
{
instructions.emplace_back(-1, now);
now = "";
}

for (const auto & it : instructions)
if (it.first >= num_captures)
throw Exception("Invalid replace instruction in replacement string. Id: " + toString(it.first) + ", but regexp has only "
+ toString(num_captures - 1)
+ " subpatterns",
ErrorCodes::BAD_ARGUMENTS);

return instructions;
}


/// Append to `res_data` the result of applying the replacement to one string, followed by a zero byte,
/// and advance `res_offset` by the number of bytes written.
///
/// input        - the string to process.
/// res_data     - output buffer; it is grown with resize() right before every write.
/// res_offset   - write position inside res_data.
///                NOTE(review): the writes assume it equals res_data.size() on entry — confirm at call sites.
/// pos          - 1-based byte position where the search starts; values <= 0 mean "from the beginning".
///                Bytes located before that position are copied unchanged.
/// occ          - 0 replaces every match; otherwise only the occ-th match (counted from the start position) is replaced.
/// searcher     - compiled regexp used for matching.
/// num_captures - number of submatch slots requested from the regexp (slot 0 is the whole match);
///                the local array holds max_captures slots, so it must not exceed that.
/// instructions - replacement program, see createInstructions().
static void processString(const re2_st::StringPiece & input,
ColumnString::Chars_t & res_data,
ColumnString::Offset & res_offset,
const Int64 & pos,
const Int64 & occ,
re2_st::RE2 & searcher,
int num_captures,
const Instructions & instructions)
{
re2_st::StringPiece matches[max_captures];

/// Convert the 1-based position into a 0-based byte offset.
size_t start_pos = pos <= 0 ? 0 : pos - 1;
/// Number of matches seen so far, compared against `occ`.
Int64 match_occ = 0;
/// The start position may lie beyond the end of the input; never copy more than the input holds.
size_t prefix_length = std::min(start_pos, static_cast<size_t>(input.length()));
if (prefix_length > 0)
{
/// Copy prefix
res_data.resize(res_data.size() + prefix_length);
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data(), prefix_length);
res_offset += prefix_length;
}
while (start_pos < static_cast<size_t>(input.length()))
{
/// If no more replacements possible for current string
bool can_finish_current_string = false;

if (searcher.Match(input, start_pos, input.length(), re2_st::RE2::Anchor::UNANCHORED, matches, num_captures))
{
match_occ++;
/// If occ == 0 every match is replaced; otherwise only the occ-th match is replaced.
if (occ == 0 || match_occ == occ)
{
const auto & match = matches[0];
/// Distance from the current position to the beginning of the match.
size_t bytes_to_copy = (match.data() - input.data()) - start_pos;

/// Copy prefix before matched regexp without modification
res_data.resize(res_data.size() + bytes_to_copy);
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data() + start_pos, bytes_to_copy);
res_offset += bytes_to_copy;
/// Continue right after the matched text.
start_pos += bytes_to_copy + match.length();

/// Do substitution instructions
for (const auto & it : instructions)
{
if (it.first >= 0)
{
/// Substitution: insert the text captured by group `it.first`.
res_data.resize(res_data.size() + matches[it.first].length());
memcpy(&res_data[res_offset], matches[it.first].data(), matches[it.first].length());
res_offset += matches[it.first].length();
}
else
{
/// Literal: insert the stored string as is.
res_data.resize(res_data.size() + it.second.size());
memcpy(&res_data[res_offset], it.second.data(), it.second.size());
res_offset += it.second.size();
}
}

/// With occ > 0 only the occ-th match is ever replaced, even when replace_one is false:
/// any later match fails the condition above and is copied through unchanged by the branch below.
if (replace_one || match.length() == 0) /// Stop after match of zero length, to avoid infinite loop.
can_finish_current_string = true;
}
else
{
const auto & match = matches[0];
/// Everything up to and including this match is kept as is.
size_t bytes_to_copy = (match.data() - input.data()) - start_pos + match.length();

/// Copy the matched string without modification
res_data.resize(res_data.size() + bytes_to_copy);
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data() + start_pos, bytes_to_copy);
res_offset += bytes_to_copy;
start_pos += bytes_to_copy;
/// A zero-length match would not advance the position past it, so stop here as well.
if (match.length() == 0)
can_finish_current_string = true;
}
}
else
can_finish_current_string = true;

/// If ready, append suffix after match to end of string.
if (can_finish_current_string)
{
res_data.resize(res_data.size() + input.length() - start_pos);
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data() + start_pos, input.length() - start_pos);
res_offset += input.length() - start_pos;
/// Terminates the loop.
start_pos = input.length();
}
}

/// Every output string ends with a zero byte.
res_data.resize(res_data.size() + 1);
res_data[res_offset] = 0;
++res_offset;
}


/// Apply the regexp replacement to every row of a string column.
///
/// data, offsets         - input column: the rows stored back to back and, for each row, its end offset.
///                         Row i is read as the bytes [offsets[i - 1], offsets[i] - 1), i.e. without its last byte.
/// needle                - regexp pattern; when it is empty the input column is copied to the output unchanged.
/// replacement           - replacement template, parsed by createInstructions().
/// pos, occ              - passed to processString() for every row.
/// match_type, collator  - converted into RE2 mode modifiers that are prepended to the pattern.
/// res_data, res_offsets - output column, written in the same layout as the input.
static void vector(const ColumnString::Chars_t & data,
                   const ColumnString::Offsets & offsets,
                   const std::string & needle,
                   const std::string & replacement,
                   const Int64 & pos,
                   const Int64 & occ,
                   const std::string & match_type,
                   TiDB::TiDBCollatorPtr collator,
                   ColumnString::Chars_t & res_data,
                   ColumnString::Offsets & res_offsets)
{
    ColumnString::Offset res_offset = 0;
    res_data.reserve(data.size());
    const size_t size = offsets.size();
    res_offsets.resize(size);

    if (needle.empty())
    {
        /// Copy all the data without changing.
        res_data.resize(data.size());
        memcpy(&res_data[0], &data[0], data.size());
        /// Size the copy by the element type of the offsets array instead of a hard-coded integer type.
        memcpy(&res_offsets[0], &offsets[0], size * sizeof(ColumnString::Offset));
        return;
    }

    String updated_needle = needle;
    if (!match_type.empty() || collator != nullptr)
    {
        String mode_modifiers = re2Util::getRE2ModeModifiers(match_type, collator);
        if (!mode_modifiers.empty())
            updated_needle = mode_modifiers + updated_needle;
    }
    /// NOTE(review): searcher.ok() is not checked; presumably the pattern is validated before it gets here — confirm.
    re2_st::RE2 searcher(updated_needle);
    /// +1 because slot 0 holds the whole match; capped by the size of the submatch array.
    int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, static_cast<int>(max_captures));

    Instructions instructions = createInstructions(replacement, num_captures);

    /// Cannot perform search for whole block. Will process each string separately.
    for (size_t i = 0; i < size; ++i)
    {
        /// Keep the row start in size_t: narrowing the offset to int would wrap
        /// once the column holds more than INT_MAX bytes and produce an out-of-bounds pointer.
        const size_t from = i > 0 ? offsets[i - 1] : 0;
        re2_st::StringPiece input(reinterpret_cast<const char *>(&data[0] + from), offsets[i] - from - 1);

        processString(input, res_data, res_offset, pos, occ, searcher, num_captures, instructions);
        res_offsets[i] = res_offset;
    }
}

/// Apply the regexp replacement to every row of a fixed-width string column.
///
/// data                  - input rows stored back to back, `n` bytes each (the row count is data.size() / n).
/// n                     - width of one row in bytes.
/// needle                - regexp pattern; an empty pattern is rejected with
///                         Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND).
/// replacement           - replacement template, parsed by createInstructions().
/// pos, occ              - passed to processString() for every row.
/// match_type, collator  - converted into RE2 mode modifiers that are prepended to the pattern.
/// res_data, res_offsets - output column: variable-length strings, each followed by a zero byte, plus end offsets.
static void vectorFixed(const ColumnString::Chars_t & data,
                        size_t n,
                        const std::string & needle,
                        const std::string & replacement,
                        const Int64 & pos,
                        const Int64 & occ,
                        const std::string & match_type,
                        TiDB::TiDBCollatorPtr collator,
                        ColumnString::Chars_t & res_data,
                        ColumnString::Offsets & res_offsets)
{
    ColumnString::Offset res_offset = 0;
    const size_t size = data.size() / n;
    res_data.reserve(data.size());
    res_offsets.resize(size);

    if (needle.empty())
    {
        /// TODO: copy all the data without changing
        throw Exception("Length of the second argument of function replace must be greater than 0.", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
    }

    String updated_needle = needle;
    if (!match_type.empty() || collator != nullptr)
    {
        String mode_modifiers = re2Util::getRE2ModeModifiers(match_type, collator);
        if (!mode_modifiers.empty())
            updated_needle = mode_modifiers + updated_needle;
    }
    /// NOTE(review): searcher.ok() is not checked; presumably the pattern is validated before it gets here — confirm.
    re2_st::RE2 searcher(updated_needle);
    /// +1 because slot 0 holds the whole match; capped by the size of the submatch array.
    int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, static_cast<int>(max_captures));

    Instructions instructions = createInstructions(replacement, num_captures);

    for (size_t i = 0; i < size; ++i)
    {
        /// Keep the row start in size_t: `i * n` narrowed to int would wrap
        /// once the column holds more than INT_MAX bytes and produce an out-of-bounds pointer.
        const size_t from = i * n;
        re2_st::StringPiece input(reinterpret_cast<const char *>(&data[0] + from), n);

        processString(input, res_data, res_offset, pos, occ, searcher, num_captures, instructions);
        res_offsets[i] = res_offset;
    }
}
static void constant(const String & input, const String & needle, const String & replacement, const Int64 & pos, const Int64 & occ, const String & match_type, TiDB::TiDBCollatorPtr collator, String & output)
{
ColumnString::Chars_t input_data;
input_data.insert(input_data.end(), input.begin(), input.end());
ColumnString::Offsets input_offsets;
input_offsets.push_back(input_data.size() + 1);
ColumnString::Chars_t output_data;
ColumnString::Offsets output_offsets;
vector(input_data, input_offsets, needle, replacement, pos, occ, match_type, collator, output_data, output_offsets);
output = String(reinterpret_cast<const char *>(&output_data[0]), output_offsets[0] - 1);
}
};

/// Regexp matching functions, both instantiated from FunctionStringRegexp.
using FunctionRegexpLike = FunctionStringRegexp<NameRegexpLike>;
using FunctionTiDBRegexp = FunctionStringRegexp<NameTiDBRegexp>;

/// Regexp replacing functions: ReplaceRegexpImpl<false> for the "All" variant, ReplaceRegexpImpl<true> for the "One" variant.
using FunctionReplaceRegexpAll = FunctionStringReplace<ReplaceRegexpImpl<false>, NameReplaceRegexpAll>;
using FunctionReplaceRegexpOne = FunctionStringReplace<ReplaceRegexpImpl<true>, NameReplaceRegexpOne>;

/// Register the regexp replace and regexp match functions of this file in `factory`.
void registerFunctionsRegexp(FunctionFactory & factory)
{
    /// Replace functions.
    factory.registerFunction<FunctionReplaceRegexpOne>();
    factory.registerFunction<FunctionReplaceRegexpAll>();

    /// Match functions.
    factory.registerFunction<FunctionTiDBRegexp>();
    factory.registerFunction<FunctionRegexpLike>();
}

} // namespace DB
Loading

0 comments on commit 60f5dce

Please sign in to comment.